Skip to content

Handlers Module

gigaspatial.handlers

base

BaseHandler

Bases: ABC

Abstract base class that orchestrates configuration, downloading, and reading functionality.

This class serves as the main entry point for dataset handlers, providing a unified interface for data acquisition and loading. It manages the lifecycle of config, downloader, and reader components.

Subclasses should implement the abstract methods to provide specific handler types and define how components are created and interact.

Source code in gigaspatial/handlers/base.py
class BaseHandler(ABC):
    """
    Abstract base class that orchestrates configuration, downloading, and reading functionality.

    This class serves as the main entry point for dataset handlers, providing a unified
    interface for data acquisition and loading. It manages the lifecycle of config,
    downloader, and reader components.

    Subclasses should implement the abstract methods to provide specific handler types
    and define how components are created and interact.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        downloader: Optional[BaseHandlerDownloader] = None,
        reader: Optional[BaseHandlerReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the BaseHandler with optional components.

        Args:
            config: Configuration object. If None, will be created via create_config()
            downloader: Downloader instance. If None, will be created via create_downloader()
            reader: Reader instance. If None, will be created via create_reader()
            data_store: Data store instance. Defaults to LocalDataStore if not provided
            logger: Logger instance. If not provided, creates one based on class name
        """
        # Initialize data store first as it's used by other components
        self.data_store = data_store or LocalDataStore()

        # Initialize logger
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

        # Initialize or create config
        self._config = config
        if self._config is None:
            self._config = self.create_config(
                data_store=self.data_store, logger=self.logger
            )

        # Initialize or create downloader
        self._downloader = downloader
        if self._downloader is None:
            self._downloader = self.create_downloader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

        # Initialize or create reader
        self._reader = reader
        if self._reader is None:
            self._reader = self.create_reader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

    @property
    def config(self) -> BaseHandlerConfig:
        """Get the configuration object."""
        return self._config

    @property
    def downloader(self) -> BaseHandlerDownloader:
        """Get the downloader object."""
        return self._downloader

    @property
    def reader(self) -> BaseHandlerReader:
        """Get the reader object."""
        return self._reader

    # Abstract factory methods for creating components
    @abstractmethod
    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> BaseHandlerConfig:
        """
        Create and return a configuration object for this handler.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured BaseHandlerConfig instance
        """
        pass

    @abstractmethod
    def create_downloader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerDownloader:
        """
        Create and return a downloader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured BaseHandlerDownloader instance
        """
        pass

    @abstractmethod
    def create_reader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerReader:
        """
        Create and return a reader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured BaseHandlerReader instance
        """
        pass

    # High-level interface methods
    def ensure_data_available(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        force_download: bool = False,
        **kwargs,
    ) -> bool:
        """
        Ensure that data is available for the given source.

        This method checks if the required data exists locally, and if not (or if
        force_download is True), downloads it using the downloader.

        Args:
            source: The data source specification
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters passed to download methods

        Returns:
            bool: True if data is available after this operation
        """
        try:
            # Get relevant units (cached if already computed for this source)
            data_units = self.config.get_relevant_data_units(
                source, force_recompute=force_download, **kwargs
            )
            data_paths = self.config.get_data_unit_paths(data_units, **kwargs)

            # Check if data exists (unless force download)
            if not force_download:
                missing_paths = [
                    path
                    for path in data_paths
                    if not self.data_store.file_exists(str(path))
                ]
            else:
                # If force_download, treat all as missing
                missing_paths = data_paths

            if not missing_paths:
                self.logger.info("All required data is already available")
                return True

            # Map units to paths (assumes correspondence order; adapt if needed)
            # NOTE(review): assumes each path is hashable (str/Path) — confirm
            # that get_data_unit_path never returns a list here.
            path_to_unit = dict(zip(data_paths, data_units))
            if force_download:
                units_to_download = data_units
            else:
                units_to_download = [
                    path_to_unit[p] for p in missing_paths if p in path_to_unit
                ]

            if units_to_download:
                self.downloader.download_data_units(units_to_download, **kwargs)
            else:
                # Fallback - download by source if unit mapping isn't available
                self.downloader.download(source, **kwargs)

            # After attempted download, check again
            remaining_missing = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]
            if remaining_missing:
                self.logger.error(
                    f"Some data still missing after download: {remaining_missing}"
                )
                return False

            return True
        except Exception as e:
            self.logger.error(f"Failed to ensure data availability: {e}")
            return False

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source specification
            crop_to_source: If True, crop the loaded data to the source geometry
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            Loaded data (type depends on specific handler implementation)

        Raises:
            RuntimeError: If ensure_available is True and the data cannot be
                made available
        """
        if ensure_available:
            if not self.ensure_data_available(source, **kwargs):
                raise RuntimeError("Could not ensure data availability for loading")

        return self.reader.load(source, crop_to_source=crop_to_source, **kwargs)

    def download_and_load(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        force_download: bool = False,
        **kwargs,
    ) -> Any:
        """
        Convenience method to download (if needed) and load data in one call.

        Args:
            source: The data source specification
            crop_to_source: If True, crop the loaded data to the source geometry
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters

        Returns:
            Loaded data

        Raises:
            RuntimeError: If the required data could not be made available
        """
        # Previously the boolean result was silently discarded and loading was
        # attempted even after a failed download; fail fast instead, matching
        # the behavior of load_data().
        if not self.ensure_data_available(
            source, force_download=force_download, **kwargs
        ):
            raise RuntimeError("Could not ensure data availability for loading")
        return self.reader.load(source, crop_to_source=crop_to_source, **kwargs)

    def get_available_data_info(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ) -> dict:
        """
        Get information about available data for the given source.

        Args:
            source: The data source specification
            **kwargs: Additional parameters

        Returns:
            dict: Information about data availability, paths, etc.
        """
        try:
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
            else:
                data_paths = self.reader.resolve_source_paths(source, **kwargs)

            existing_paths = [
                path for path in data_paths if self.data_store.file_exists(str(path))
            ]
            missing_paths = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]

            return {
                "total_data_units": len(data_paths),
                "available_data_units": len(existing_paths),
                "missing_data_units": len(missing_paths),
                "available_paths": existing_paths,
                "missing_paths": missing_paths,
                "all_available": len(missing_paths) == 0,
            }

        except Exception as e:
            # Errors are reported in-band so callers always get the same shape.
            self.logger.error(f"Failed to get data info: {e}")
            return {
                "error": str(e),
                "total_data_units": 0,
                "available_data_units": 0,
                "missing_data_units": 0,
                "available_paths": [],
                "missing_paths": [],
                "all_available": False,
            }

    def cleanup(self):
        """
        Cleanup resources used by the handler.

        Override in subclasses if specific cleanup is needed.
        """
        self.logger.info(f"Cleaning up {self.__class__.__name__}")
        # Subclasses can override to add specific cleanup logic

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.cleanup()

    def __repr__(self) -> str:
        """String representation of the handler."""
        return (
            f"{self.__class__.__name__}("
            f"config={self.config.__class__.__name__}, "
            f"downloader={self.downloader.__class__.__name__}, "
            f"reader={self.reader.__class__.__name__})"
        )
config: BaseHandlerConfig property

Get the configuration object.

downloader: BaseHandlerDownloader property

Get the downloader object.

reader: BaseHandlerReader property

Get the reader object.

__enter__()

Context manager entry.

Source code in gigaspatial/handlers/base.py
def __enter__(self):
    """Context manager entry; returns this handler for use in the with-block."""
    return self
__exit__(exc_type, exc_val, exc_tb)

Context manager exit.

Source code in gigaspatial/handlers/base.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit.

    Runs cleanup(); returns None, so any exception propagates to the caller.
    """
    self.cleanup()
__init__(config=None, downloader=None, reader=None, data_store=None, logger=None)

Initialize the BaseHandler with optional components.

Parameters:

Name Type Description Default
config Optional[BaseHandlerConfig]

Configuration object. If None, will be created via create_config()

None
downloader Optional[BaseHandlerDownloader]

Downloader instance. If None, will be created via create_downloader()

None
reader Optional[BaseHandlerReader]

Reader instance. If None, will be created via create_reader()

None
data_store Optional[DataStore]

Data store instance. Defaults to LocalDataStore if not provided

None
logger Optional[Logger]

Logger instance. If not provided, creates one based on class name

None
Source code in gigaspatial/handlers/base.py
def __init__(
    self,
    config: Optional[BaseHandlerConfig] = None,
    downloader: Optional[BaseHandlerDownloader] = None,
    reader: Optional[BaseHandlerReader] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the BaseHandler with optional components.

    Components are created in dependency order: data store and logger first,
    then config, then downloader and reader (which receive the config).

    Args:
        config: Configuration object. If None, will be created via create_config()
        downloader: Downloader instance. If None, will be created via create_downloader()
        reader: Reader instance. If None, will be created via create_reader()
        data_store: Data store instance. Defaults to LocalDataStore if not provided
        logger: Logger instance. If not provided, creates one based on class name
    """
    # Initialize data store first as it's used by other components
    self.data_store = data_store or LocalDataStore()

    # Initialize logger
    self.logger = logger or global_config.get_logger(self.__class__.__name__)

    # Initialize or create config
    self._config = config
    if self._config is None:
        self._config = self.create_config(
            data_store=self.data_store, logger=self.logger
        )

    # Initialize or create downloader
    self._downloader = downloader
    if self._downloader is None:
        self._downloader = self.create_downloader(
            config=self._config, data_store=self.data_store, logger=self.logger
        )

    # Initialize or create reader
    self._reader = reader
    if self._reader is None:
        self._reader = self.create_reader(
            config=self._config, data_store=self.data_store, logger=self.logger
        )
__repr__()

String representation of the handler.

Source code in gigaspatial/handlers/base.py
def __repr__(self) -> str:
    """Return a short representation naming the component classes."""
    cfg_name = self.config.__class__.__name__
    dl_name = self.downloader.__class__.__name__
    rd_name = self.reader.__class__.__name__
    return (
        f"{self.__class__.__name__}("
        f"config={cfg_name}, downloader={dl_name}, reader={rd_name})"
    )
cleanup()

Cleanup resources used by the handler.

Override in subclasses if specific cleanup is needed.

Source code in gigaspatial/handlers/base.py
def cleanup(self):
    """
    Release any resources held by the handler.

    The base implementation only logs; subclasses override when they hold
    resources that need explicit teardown.
    """
    handler_name = self.__class__.__name__
    self.logger.info(f"Cleaning up {handler_name}")
create_config(data_store, logger, **kwargs) abstractmethod

Create and return a configuration object for this handler.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
BaseHandlerConfig

Configured BaseHandlerConfig instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> BaseHandlerConfig:
    """
    Create and return a configuration object for this handler.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured BaseHandlerConfig instance
    """
    # No default implementation: each concrete handler builds its own config.
    pass
create_downloader(config, data_store, logger, **kwargs) abstractmethod

Create and return a downloader object for this handler.

Parameters:

Name Type Description Default
config BaseHandlerConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
BaseHandlerDownloader

Configured BaseHandlerDownloader instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_downloader(
    self,
    config: BaseHandlerConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> BaseHandlerDownloader:
    """
    Create and return a downloader object for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured BaseHandlerDownloader instance
    """
    # No default implementation: each concrete handler builds its own downloader.
    pass
create_reader(config, data_store, logger, **kwargs) abstractmethod

Create and return a reader object for this handler.

Parameters:

Name Type Description Default
config BaseHandlerConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
BaseHandlerReader

Configured BaseHandlerReader instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_reader(
    self,
    config: BaseHandlerConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> BaseHandlerReader:
    """
    Create and return a reader object for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured BaseHandlerReader instance
    """
    # No default implementation: each concrete handler builds its own reader.
    pass
download_and_load(source, crop_to_source=False, force_download=False, **kwargs)

Convenience method to download (if needed) and load data in one call.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
force_download bool

If True, download even if data exists locally

False
**kwargs

Additional parameters

{}

Returns:

Type Description
Any

Loaded data

Source code in gigaspatial/handlers/base.py
def download_and_load(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    force_download: bool = False,
    **kwargs,
) -> Any:
    """
    Convenience method to download (if needed) and load data in one call.

    Args:
        source: The data source specification
        crop_to_source: If True, crop the loaded data to the source geometry
        force_download: If True, download even if data exists locally
        **kwargs: Additional parameters

    Returns:
        Loaded data

    Raises:
        RuntimeError: If the required data could not be made available
    """
    # Previously the boolean result was silently discarded and loading was
    # attempted even after a failed download; fail fast instead, matching
    # the behavior of load_data().
    if not self.ensure_data_available(
        source, force_download=force_download, **kwargs
    ):
        raise RuntimeError("Could not ensure data availability for loading")
    return self.reader.load(source, crop_to_source=crop_to_source, **kwargs)
ensure_data_available(source, force_download=False, **kwargs)

Ensure that data is available for the given source.

This method checks if the required data exists locally, and if not (or if force_download is True), downloads it using the downloader.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
force_download bool

If True, download even if data exists locally

False
**kwargs

Additional parameters passed to download methods

{}

Returns:

Name Type Description
bool bool

True if data is available after this operation

Source code in gigaspatial/handlers/base.py
def ensure_data_available(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    force_download: bool = False,
    **kwargs,
) -> bool:
    """
    Ensure that data is available for the given source.

    This method checks if the required data exists locally, and if not (or if
    force_download is True), downloads it using the downloader.

    Args:
        source: The data source specification
        force_download: If True, download even if data exists locally
        **kwargs: Additional parameters passed to download methods

    Returns:
        bool: True if data is available after this operation
    """
    try:
        # Get relevant units (cached if already computed for this source)
        data_units = self.config.get_relevant_data_units(
            source, force_recompute=force_download, **kwargs
        )
        data_paths = self.config.get_data_unit_paths(data_units, **kwargs)

        # Check if data exists (unless force download)
        if not force_download:
            missing_paths = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]
        else:
            # If force_download, treat all as missing
            missing_paths = data_paths

        if not missing_paths:
            self.logger.info("All required data is already available")
            return True

        # Map units to paths (assumes correspondence order; adapt if needed)
        # NOTE(review): assumes each path is hashable (str/Path) — confirm
        # that get_data_unit_path never returns a list here.
        path_to_unit = dict(zip(data_paths, data_units))
        if force_download:
            units_to_download = data_units
        else:
            units_to_download = [
                path_to_unit[p] for p in missing_paths if p in path_to_unit
            ]

        if units_to_download:
            self.downloader.download_data_units(units_to_download, **kwargs)
        else:
            # Fallback - download by source if unit mapping isn't available
            self.downloader.download(source, **kwargs)

        # After attempted download, check again
        remaining_missing = [
            path
            for path in data_paths
            if not self.data_store.file_exists(str(path))
        ]
        if remaining_missing:
            self.logger.error(
                f"Some data still missing after download: {remaining_missing}"
            )
            return False

        return True
    except Exception as e:
        # Any failure (unit resolution, download, path checks) is reported as
        # False rather than raised, so callers can branch on availability.
        self.logger.error(f"Failed to ensure data availability: {e}")
        return False
get_available_data_info(source, **kwargs)

Get information about available data for the given source.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame]

The data source specification

required
**kwargs

Additional parameters

{}

Returns:

Name Type Description
dict dict

Information about data availability, paths, etc.

Source code in gigaspatial/handlers/base.py
def get_available_data_info(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
    ],
    **kwargs,
) -> dict:
    """
    Get information about available data for the given source.

    Args:
        source: The data source specification
        **kwargs: Additional parameters

    Returns:
        dict: Information about data availability, paths, etc.
    """
    try:
        if hasattr(self.config, "get_relevant_data_units"):
            data_units = self.config.get_relevant_data_units(source, **kwargs)
            data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
        else:
            # Fall back to the reader when the config cannot enumerate units.
            data_paths = self.reader.resolve_source_paths(source, **kwargs)

        existing_paths = [
            path for path in data_paths if self.data_store.file_exists(str(path))
        ]
        missing_paths = [
            path
            for path in data_paths
            if not self.data_store.file_exists(str(path))
        ]

        return {
            "total_data_units": len(data_paths),
            "available_data_units": len(existing_paths),
            "missing_data_units": len(missing_paths),
            "available_paths": existing_paths,
            "missing_paths": missing_paths,
            "all_available": len(missing_paths) == 0,
        }

    except Exception as e:
        # Errors are reported in-band via the "error" key, so callers always
        # receive a dict of the same shape rather than an exception.
        self.logger.error(f"Failed to get data info: {e}")
        return {
            "error": str(e),
            "total_data_units": 0,
            "available_data_units": 0,
            "missing_data_units": 0,
            "available_paths": [],
            "missing_paths": [],
            "all_available": False,
        }
load_data(source, crop_to_source=False, ensure_available=True, **kwargs)

Load data from the given source.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
Any

Loaded data (type depends on specific handler implementation)

Source code in gigaspatial/handlers/base.py
def load_data(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    ensure_available: bool = True,
    **kwargs,
) -> Any:
    """
    Load data from the given source.

    Args:
        source: The data source specification
        crop_to_source: If True, crop the loaded data to the source geometry
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        Loaded data (type depends on specific handler implementation)

    Raises:
        RuntimeError: If ensure_available is True and data cannot be made available
    """
    # Guard clause: verify availability up front when requested.
    if ensure_available and not self.ensure_data_available(source, **kwargs):
        raise RuntimeError("Could not ensure data availability for loading")

    return self.reader.load(source, crop_to_source=crop_to_source, **kwargs)

BaseHandlerConfig dataclass

Bases: ABC

Abstract base class for handler configuration objects. Provides standard fields for path, parallelism, data store, and logger. Extend this class for dataset-specific configuration.

Source code in gigaspatial/handlers/base.py
@dataclass
class BaseHandlerConfig(ABC):
    """
    Abstract base class for handler configuration objects.
    Provides standard fields for path, parallelism, data store, and logger.
    Extend this class for dataset-specific configuration.
    """

    base_path: Path = None
    # NOTE: cpu_count() is evaluated once, when the class is defined.
    n_workers: int = multiprocessing.cpu_count()
    data_store: DataStore = field(default_factory=LocalDataStore)
    logger: logging.Logger = field(default=None, repr=False)

    def __post_init__(self):
        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

        # Maps canonical source keys -> (units, geometry) tuples.
        self._unit_cache = {}

    def _cache_key(self, source, **kwargs):
        """Create a canonical cache key from source."""
        if isinstance(source, str):
            return ("country", source)
        if isinstance(source, BaseGeometry):
            return ("geometry", source.wkt)
        if isinstance(source, gpd.GeoDataFrame):
            # union_all() replaces the deprecated GeoSeries.unary_union (and
            # matches extract_search_geometry); the WKT key is unchanged.
            return ("geometry", source.geometry.union_all().wkt)
        if isinstance(source, Iterable) and all(
            isinstance(p, (Point, tuple)) for p in source
        ):
            pt_str = tuple(
                (p.x, p.y) if isinstance(p, Point) else tuple(p) for p in source
            )
            return ("points", pt_str)
        return ("other", str(source))

    def get_relevant_data_units(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        force_recompute: bool = False,
        **kwargs,
    ):
        """
        Return the data unit identifiers relevant to the given source.

        Results are cached per canonical source key; pass force_recompute=True
        to bypass the cache and recompute.
        """
        key = self._cache_key(source, **kwargs)

        # Check cache unless forced recompute
        if not force_recompute and key in self._unit_cache:
            self.logger.debug(f"Using cached units for {key[0]}: {key[1][:50]}...")
            units, _ = self._unit_cache[key]  # Unpack tuple, only return units
            return units

        # Convert source to geometry and compute units
        geometry = self.extract_search_geometry(source, **kwargs)
        units = self.get_relevant_data_units_by_geometry(geometry, **kwargs)

        # Cache both units and geometry as tuple
        self._unit_cache[key] = (units, geometry)
        return units

    @abstractmethod
    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> Any:
        """
        Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
        """
        pass

    @abstractmethod
    def get_data_unit_path(self, unit: Any, **kwargs) -> list:
        """
        Given a data unit identifier, return the corresponding file path.
        """
        pass

    def get_data_unit_paths(self, units: Union[Iterable[Any]], **kwargs) -> list:
        """
        Given data unit identifiers, return the corresponding file paths.
        """
        # Wrap scalars — including strings, which are themselves iterable and
        # would otherwise be expanded character by character.
        if isinstance(units, str) or not isinstance(units, Iterable):
            units = [units]

        if not units:
            return []

        return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]

    def extract_search_geometry(self, source, **kwargs):
        """General method to extract a canonical geometry from supported source types."""
        if isinstance(source, str):
            # Use the admin boundary as geometry
            from gigaspatial.handlers.boundaries import AdminBoundaries

            return (
                AdminBoundaries.create(country_code=source, **kwargs)
                .boundaries[0]
                .geometry
            )
        elif isinstance(source, gpd.GeoDataFrame):
            if crs := kwargs.get("crs", None):

                if not source.crs:
                    raise ValueError(
                        "Cannot extract search geometry. Please set a crs on the source object first."
                    )

                if source.crs != crs:
                    source = source.to_crs(crs)

            return source.geometry.union_all()
        elif isinstance(
            source,
            BaseGeometry,
        ):
            return source
        elif isinstance(source, Iterable) and all(
            isinstance(p, (Point, Iterable)) for p in source
        ):
            # Tuples appear to be (lat, lon): Point takes (x=lon, y=lat) here.
            # NOTE(review): confirm this ordering against callers.
            points = [p if isinstance(p, Point) else Point(p[1], p[0]) for p in source]
            return MultiPoint(points)
        else:
            raise ValueError(f"Unsupported source type: {type(source)}")

    def get_cached_search_geometry(self, source):
        """Return the cached search geometry for the source, or None if absent."""
        key = self._cache_key(source)
        result = self._unit_cache.get(key)
        if result:
            _, geometry = result
            return geometry
        return None

    def clear_unit_cache(self):
        """Clear cached units."""
        self._unit_cache.clear()
        self.logger.debug("Unit cache cleared")
clear_unit_cache()

Clear cached units.

Source code in gigaspatial/handlers/base.py
def clear_unit_cache(self):
    """Clear cached units."""
    self._unit_cache.clear()
    self.logger.debug("Unit cache cleared")
extract_search_geometry(source, **kwargs)

General method to extract a canonical geometry from supported source types.

Source code in gigaspatial/handlers/base.py
def extract_search_geometry(self, source, **kwargs):
    """General method to extract a canonical geometry from supported source types."""
    if isinstance(source, str):
        # Use the admin boundary as geometry
        from gigaspatial.handlers.boundaries import AdminBoundaries

        return (
            AdminBoundaries.create(country_code=source, **kwargs)
            .boundaries[0]
            .geometry
        )
    elif isinstance(source, gpd.GeoDataFrame):
        if crs := kwargs.get("crs", None):

            if not source.crs:
                raise ValueError(
                    "Cannot extract search geometry. Please set a crs on the source object first."
                )

            if source.crs != crs:
                source = source.to_crs(crs)

        return source.geometry.union_all()
    elif isinstance(
        source,
        BaseGeometry,
    ):
        return source
    elif isinstance(source, Iterable) and all(
        isinstance(p, (Point, Iterable)) for p in source
    ):
        points = [p if isinstance(p, Point) else Point(p[1], p[0]) for p in source]
        return MultiPoint(points)
    else:
        raise ValueError(f"Unsupported source type: {type(source)}")
get_data_unit_path(unit, **kwargs) abstractmethod

Given a data unit identifier, return the corresponding file path.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def get_data_unit_path(self, unit: Any, **kwargs) -> list:
    """
    Given a data unit identifier, return the corresponding file path.
    """
    pass
get_data_unit_paths(units, **kwargs)

Given data unit identifiers, return the corresponding file paths.

Source code in gigaspatial/handlers/base.py
def get_data_unit_paths(self, units: Union[Iterable[Any]], **kwargs) -> list:
    """
    Given data unit identifiers, return the corresponding file paths.
    """
    if not isinstance(units, Iterable):
        units = [units]

    if not units:
        return []

    return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]
get_relevant_data_units_by_geometry(geometry, **kwargs) abstractmethod

Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).

Source code in gigaspatial/handlers/base.py
@abstractmethod
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> Any:
    """
    Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
    """
    pass

BaseHandlerDownloader

Bases: ABC

Abstract base class for handler downloader classes. Standardizes config, data_store, and logger initialization. Extend this class for dataset-specific downloaders.

Source code in gigaspatial/handlers/base.py
class BaseHandlerDownloader(ABC):
    """
    Abstract base class for handler downloader classes.

    Standardizes how the configuration object, the data store, and the
    logger are resolved at construction time. Dataset-specific downloaders
    extend this class and implement the two abstract download hooks.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.config = config

        # Data store precedence: explicit argument first, then the config's
        # own store (if present), finally a local filesystem store.
        if data_store:
            resolved_store = data_store
        elif config and hasattr(config, "data_store"):
            resolved_store = config.data_store
        else:
            resolved_store = LocalDataStore()
        self.data_store = resolved_store

        # Logger precedence mirrors the data store: argument, then the
        # config's logger, then a logger named after the concrete subclass.
        resolved_logger = logger
        if not resolved_logger and config:
            resolved_logger = getattr(config, "logger", None)
        if not resolved_logger:
            resolved_logger = global_config.get_logger(self.__class__.__name__)
        self.logger = resolved_logger

    @abstractmethod
    def download_data_unit(self, *args, **kwargs):
        """Download a single data unit. Implement in subclasses."""
        pass

    @abstractmethod
    def download_data_units(self, *args, **kwargs):
        """Download a collection of data units. Implement in subclasses."""
        pass

    def download(self, source, **kwargs):
        """
        Given source download the data.
        """
        relevant_units = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(relevant_units, **kwargs)
download(source, **kwargs)

Given source download the data.

Source code in gigaspatial/handlers/base.py
def download(self, source, **kwargs):
    """
    Given source download the data.
    """
    units = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(units, **kwargs)
download_data_unit(*args, **kwargs) abstractmethod

Abstract method to download data. Implement in subclasses.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def download_data_unit(self, *args, **kwargs):
    """
    Abstract method to download data. Implement in subclasses.
    """
    pass
download_data_units(*args, **kwargs) abstractmethod

Abstract method to download data. Implement in subclasses.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def download_data_units(self, *args, **kwargs):
    """
    Abstract method to download data. Implement in subclasses.
    """
    pass

BaseHandlerReader

Bases: ABC

Abstract base class for handler reader classes. Provides common methods for resolving source paths and loading data. Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths. Includes generic loader functions for raster and tabular data.

Source code in gigaspatial/handlers/base.py
class BaseHandlerReader(ABC):
    """
    Abstract base class for handler reader classes.
    Provides common methods for resolving source paths and loading data.
    Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths.
    Includes generic loader functions for raster and tabular data.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Configuration used to resolve data units and paths; may be None
        # when the reader is only ever given explicit file paths.
        self.config = config
        # Data store precedence: explicit argument, then the config's store
        # (if present), then a local filesystem store.
        if data_store:
            self.data_store = data_store
        elif config and hasattr(config, "data_store"):
            self.data_store = config.data_store
        else:
            self.data_store = LocalDataStore()

        # Logger precedence mirrors the data store resolution: argument,
        # config's logger, then a class-named logger from the global config.
        self.logger = (
            logger
            or (getattr(config, "logger", None) if config else None)
            or global_config.get_logger(self.__class__.__name__)
        )

    def resolve_source_paths(
        self,
        source: Union[
            str,  # country code
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,  # path
            str,  # path
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        Resolve source data paths based on the type of source input.

        Args:
            source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
            **kwargs: Additional parameters for path resolution

        Returns:
            List of resolved source paths
        """
        # Heuristic: a Path, a collection of str/Path, or a string containing
        # a dot (extension-like) is treated as explicit path(s) rather than a
        # country code.
        if (
            isinstance(source, Path)
            or (
                isinstance(source, (list, tuple, set))
                and all(isinstance(p, (str, Path)) for p in source)
            )
            or (isinstance(source, str) and "." in source)
        ):
            return self.resolve_by_paths(source)

        # Otherwise delegate to the config: map the source to data units,
        # then map those units to file paths.
        data_units = self.config.get_relevant_data_units(source, **kwargs)
        data_paths = self.config.get_data_unit_paths(data_units, **kwargs)

        self.logger.info(f"Resolved {len(data_paths)} paths!")
        return data_paths

    def resolve_by_paths(
        self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Return explicit paths as a list.

        A single str/Path is wrapped in a one-element list; any other
        iterable is materialized into a list.
        """
        if isinstance(paths, (str, Path)):
            return [paths]
        return list(paths)

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """Hook called before loading data.

        Normalizes a single path to a list, returns an empty list (with a
        warning) when no paths were provided, and stringifies all paths.
        """
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        if not source_data_path:
            self.logger.warning("No paths found!")
            return []

        source_data_paths = [str(file_path) for file_path in source_data_path]

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_paths

    def _post_load_hook(self, data, **kwargs) -> Any:
        """Hook called after loading data.

        Logs the record count for iterable results; returns the data
        unchanged either way.
        """
        # NOTE(review): len() assumes a sized iterable; a generator would
        # raise TypeError here — confirm loaders never return one.
        if isinstance(data, Iterable):
            if len(data) == 0:
                self.logger.warning("No data was loaded from the source files")
                return data

            self.logger.info(f"{len(data)} valid data records.")

        self.logger.info(f"Post-load processing complete.")

        return data

    def _check_file_exists(self, file_paths: List[Union[str, Path]]):
        """
        Check that all specified files exist in the data store.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to check.

        Raises:
            RuntimeError: If any file does not exist in the data store.
        """
        for file_path in file_paths:
            if not self.data_store.file_exists(str(file_path)):
                raise RuntimeError(
                    f"Source file does not exist in the data store: {file_path}"
                )

    def _load_raster_data(
        self,
        raster_paths: List[Union[str, Path]],
        merge_rasters: bool = False,
        **kwargs,
    ) -> Union[List[TifProcessor], TifProcessor]:
        """
        Load raster data from file paths.

        Args:
            raster_paths (List[Union[str, Path]]): List of file paths to raster files.
            merge_rasters (bool): If True, all rasters will be merged into a single TifProcessor.
                                  Defaults to False.

        Returns:
            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects or a single
                                                    TifProcessor if merge_rasters is True.
        """
        # A single path also takes the merged branch, returning one
        # TifProcessor instead of a one-element list.
        # NOTE(review): the "Merging" log message is emitted even for the
        # single-raster case where no actual merge happens.
        if merge_rasters or len(raster_paths) == 1:
            self.logger.info(
                f"Merging {len(raster_paths)} rasters into a single TifProcessor."
            )
            return TifProcessor(raster_paths, self.data_store, **kwargs)
        else:
            return [
                TifProcessor(data_path, self.data_store, **kwargs)
                for data_path in raster_paths
            ]

    def _load_tabular_data(
        self,
        file_paths: List[Union[str, Path]],
        read_function: Callable = read_dataset,
        show_progress: bool = True,
        progress_desc: Optional[str] = None,
        **kwargs,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Load and concatenate tabular data from multiple files.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to load data from.
            read_function (Callable): Function to use for reading individual files.
                Defaults to read_dataset. Should accept (data_store, file_path) arguments.
            show_progress (bool): Whether to display a progress bar while loading files.
            progress_desc (Optional[str]): Custom description for the progress bar.

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: Concatenated data from all files.
                Returns empty DataFrame if no data is loaded.
        """
        all_data = []
        iterator: Iterable = file_paths
        if show_progress and file_paths:
            iterator = tqdm(
                file_paths,
                desc=progress_desc or "Loading tabular data",
                total=len(file_paths),
            )

        for file_path in iterator:
            all_data.append(read_function(self.data_store, file_path, **kwargs))

        if not all_data:
            return pd.DataFrame()
        result = pd.concat(all_data, ignore_index=True)
        return result

    def crop_to_geometry(self, data, geometry, predicate="intersects", **kwargs):
        """Crop loaded data to ``geometry``.

        Args:
            data: A (Geo)DataFrame or TifProcessor to crop; any other type is
                returned unchanged.
            geometry: Shapely geometry used as the crop area. Assumed to be in
                EPSG:4326 unless a ``crs`` keyword argument says otherwise.
            predicate: Name of the GeoSeries spatial predicate method used to
                filter rows (e.g. "intersects", "within").
            **kwargs: Forwarded to geometry conversion / clipping helpers.

        Returns:
            The cropped data, or the input unchanged when cropping is not
            applicable or the DataFrame cannot be converted to a GeoDataFrame.
        """

        # Project geometry to the projection of the data if data has projection
        geom_crs = kwargs.get("crs", "EPSG:4326")
        if hasattr(data, "crs") and data.crs != geom_crs:
            geometry = (
                gpd.GeoDataFrame(geometry=[geometry], crs="EPSG:4326")
                .to_crs(data.crs)
                .geometry[0]
            )

        # Tabular (GeoDataFrame) case
        if isinstance(data, (pd.DataFrame, gpd.GeoDataFrame)):
            if isinstance(data, pd.DataFrame):
                from gigaspatial.processing.geo import convert_to_geodataframe

                # NOTE(review): bare except silently falls back to returning
                # the unfiltered data when conversion fails — consider
                # narrowing to Exception and logging.
                try:
                    data = convert_to_geodataframe(data, **kwargs)
                except:
                    return data

            # Clip to geometry
            return data[getattr(data.geometry, predicate)(geometry)]

        # Raster case
        if isinstance(data, TifProcessor):
            return data.clip_to_geometry(geometry=geometry, **kwargs)

        return data

    @abstractmethod
    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        """
        Abstract method to load source data from paths.

        Args:
            source_data_path: List of source paths
            **kwargs: Additional parameters for data loading

        Returns:
            Loaded data (DataFrame, GeoDataFrame, etc.)
        """
        pass

    def load(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,
            str,
            List[Union[str, Path]],
        ],
        crop_to_source: bool = False,
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source (country code/name, points, geometry, paths, etc.).
            crop_to_source : bool, default False
                If True, crop loaded data to the exact source geometry
            **kwargs: Additional parameters to pass to the loading process.

        Returns:
            The loaded data. The type depends on the subclass implementation.
        """
        source_data_paths = self.resolve_source_paths(source, **kwargs)
        if not source_data_paths:
            self.logger.warning(
                "No source data paths resolved. There's no matching data to load!"
            )
            return None
        processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
        if not processed_paths:
            self.logger.warning("No valid paths to load data from.")
            return None

        loaded_data = self.load_from_paths(processed_paths, **kwargs)
        loaded_data = self._post_load_hook(loaded_data, **kwargs)

        # Apply cropping if requested
        if crop_to_source and loaded_data is not None:
            # Prefer the geometry cached by the config during path
            # resolution; fall back to computing it from the source.
            search_geometry = self.config.get_cached_search_geometry(source)
            if search_geometry is not None and isinstance(
                search_geometry, BaseGeometry
            ):
                loaded_data = self.crop_to_geometry(loaded_data, search_geometry)
            else:
                # If no cached geometry, compute it
                search_geometry = self.config.extract_search_geometry(source, **kwargs)
                if isinstance(search_geometry, BaseGeometry):
                    loaded_data = self.crop_to_geometry(loaded_data, search_geometry)

        return loaded_data
load(source, crop_to_source=False, **kwargs)

Load data from the given source.

Parameters:

Name Type Description Default
source Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame, Path, str, List[Union[str, Path]]]

The data source (country code/name, points, geometry, paths, etc.).

required
crop_to_source

bool, default False If True, crop loaded data to the exact source geometry

False
**kwargs

Additional parameters to pass to the loading process.

{}

Returns:

Type Description
Any

The loaded data. The type depends on the subclass implementation.

Source code in gigaspatial/handlers/base.py
def load(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,
        gpd.GeoDataFrame,
        Path,
        str,
        List[Union[str, Path]],
    ],
    crop_to_source: bool = False,
    **kwargs,
) -> Any:
    """
    Load data from the given source.

    Args:
        source: The data source (country code/name, points, geometry, paths, etc.).
        crop_to_source : bool, default False
            If True, crop loaded data to the exact source geometry
        **kwargs: Additional parameters to pass to the loading process.

    Returns:
        The loaded data. The type depends on the subclass implementation.
    """
    source_data_paths = self.resolve_source_paths(source, **kwargs)
    if not source_data_paths:
        self.logger.warning(
            "No source data paths resolved. There's no matching data to load!"
        )
        return None
    processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
    if not processed_paths:
        self.logger.warning("No valid paths to load data from.")
        return None

    loaded_data = self.load_from_paths(processed_paths, **kwargs)
    loaded_data = self._post_load_hook(loaded_data, **kwargs)

    # Apply cropping if requested
    if crop_to_source and loaded_data is not None:
        search_geometry = self.config.get_cached_search_geometry(source)
        if search_geometry is not None and isinstance(
            search_geometry, BaseGeometry
        ):
            loaded_data = self.crop_to_geometry(loaded_data, search_geometry)
        else:
            # If no cached geometry, compute it
            search_geometry = self.config.extract_search_geometry(source, **kwargs)
            if isinstance(search_geometry, BaseGeometry):
                loaded_data = self.crop_to_geometry(loaded_data, search_geometry)

    return loaded_data
load_from_paths(source_data_path, **kwargs) abstractmethod

Abstract method to load source data from paths.

Parameters:

Name Type Description Default
source_data_path List[Union[str, Path]]

List of source paths

required
**kwargs

Additional parameters for data loading

{}

Returns:

Type Description
Any

Loaded data (DataFrame, GeoDataFrame, etc.)

Source code in gigaspatial/handlers/base.py
@abstractmethod
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Any:
    """
    Abstract method to load source data from paths.

    Args:
        source_data_path: List of source paths
        **kwargs: Additional parameters for data loading

    Returns:
        Loaded data (DataFrame, GeoDataFrame, etc.)
    """
    pass
resolve_by_paths(paths, **kwargs)

Return explicit paths as a list.

Source code in gigaspatial/handlers/base.py
def resolve_by_paths(
    self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
) -> List[Union[str, Path]]:
    """
    Return explicit paths as a list.
    """
    if isinstance(paths, (str, Path)):
        return [paths]
    return list(paths)
resolve_source_paths(source, **kwargs)

Resolve source data paths based on the type of source input.

Parameters:

Name Type Description Default
source Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame, Path, str, List[Union[str, Path]]]

Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)

required
**kwargs

Additional parameters for path resolution

{}

Returns:

Type Description
List[Union[str, Path]]

List of resolved source paths

Source code in gigaspatial/handlers/base.py
def resolve_source_paths(
    self,
    source: Union[
        str,  # country code
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,
        gpd.GeoDataFrame,
        Path,  # path
        str,  # path
        List[Union[str, Path]],
    ],
    **kwargs,
) -> List[Union[str, Path]]:
    """
    Resolve source data paths based on the type of source input.

    Args:
        source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
        **kwargs: Additional parameters for path resolution

    Returns:
        List of resolved source paths
    """
    if (
        isinstance(source, Path)
        or (
            isinstance(source, (list, tuple, set))
            and all(isinstance(p, (str, Path)) for p in source)
        )
        or (isinstance(source, str) and "." in source)
    ):
        return self.resolve_by_paths(source)

    data_units = self.config.get_relevant_data_units(source, **kwargs)
    data_paths = self.config.get_data_unit_paths(data_units, **kwargs)

    self.logger.info(f"Resolved {len(data_paths)} paths!")
    return data_paths

boundaries

AdminBoundaries

Bases: BaseModel

Base class for administrative boundary data with flexible fields.

Source code in gigaspatial/handlers/boundaries.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
class AdminBoundaries(BaseModel):
    """Base class for administrative boundary data with flexible fields."""

    boundaries: List[AdminBoundary] = Field(default_factory=list)
    level: int = Field(
        ...,
        ge=0,
        le=4,
        description="Administrative level (e.g., 0=country, 1=state, etc.)",
    )

    logger: ClassVar = global_config.get_logger("AdminBoundaries")

    _schema_config: ClassVar[Dict[str, Dict[str, str]]] = {
        "gadm": {
            "country_code": "GID_0",
            "id": "GID_{level}",
            "name": "NAME_{level}",
            "parent_id": "GID_{parent_level}",
        },
        "internal": {
            "id": "admin{level}_id_giga",
            "name": "name",
            "name_en": "name_en",
            "country_code": "iso_3166_1_alpha_3",
        },
        "geoBoundaries": {
            "id": "shapeID",
            "name": "shapeName",
            "country_code": "shapeGroup",
        },
    }

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Convert the AdminBoundaries to a GeoDataFrame."""
        if not self.boundaries:
            if hasattr(self, "_empty_schema"):
                columns = self._empty_schema
            else:
                columns = ["id", "name", "country_code", "geometry"]
                if self.level > 0:
                    columns.append("parent_id")

            return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)

        return gpd.GeoDataFrame(
            [boundary.model_dump() for boundary in self.boundaries],
            geometry="geometry",
            crs=4326,
        )

    @classmethod
    def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
        """Return field mappings for different data sources"""
        return cls._schema_config

    @classmethod
    def from_gadm(
        cls, country_code: str, admin_level: int = 0, **kwargs
    ) -> "AdminBoundaries":
        """Load and create instance from GADM data."""
        url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{country_code}_{admin_level}.json"
        cls.logger.info(
            f"Loading GADM data for country: {country_code}, admin level: {admin_level} from URL: {url}"
        )
        try:
            gdf = gpd.read_file(url)

            gdf = cls._map_fields(gdf, "gadm", admin_level)

            if admin_level == 0:
                gdf["country_code"] = gdf["id"]
                gdf["name"] = gdf["COUNTRY"]
            elif admin_level == 1:
                gdf["country_code"] = gdf["parent_id"]

            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
            return cls(
                boundaries=boundaries, level=admin_level, country_code=country_code
            )

        except (ValueError, HTTPError, FileNotFoundError) as e:
            cls.logger.warning(
                f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(country_code, admin_level, "gadm")

    @classmethod
    def from_data_store(
        cls,
        data_store: DataStore,
        path: Union[str, "Path"],
        admin_level: int = 0,
        **kwargs,
    ) -> "AdminBoundaries":
        """Load and create instance from internal data store."""
        cls.logger.info(
            f"Loading data from data store at path: {path}, admin level: {admin_level}"
        )
        try:
            gdf = read_dataset(data_store, str(path), **kwargs)

            if gdf.empty:
                cls.logger.warning(f"No data found at {path}.")
                return cls._create_empty_instance(None, admin_level, "internal")

            gdf = cls._map_fields(gdf, "internal", admin_level)

            if admin_level == 0:
                gdf["id"] = gdf["country_code"]
            else:
                gdf["parent_id"] = gdf["id"].apply(lambda x: x[:-3])

            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
            return cls(boundaries=boundaries, level=admin_level)

        except (FileNotFoundError, KeyError) as e:
            cls.logger.warning(
                f"No data found at {path} for admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(None, admin_level, "internal")

    @classmethod
    def from_georepo(
        cls,
        country_code: str = None,
        admin_level: int = 0,
        **kwargs,
    ) -> "AdminBoundaries":
        """
        Load and create instance from GeoRepo (UNICEF) API.

        Args:
            country: Country name (if using name-based lookup)
            iso3: ISO3 code (if using code-based lookup)
            admin_level: Administrative level (0=country, 1=state, etc.)
            api_key: GeoRepo API key (optional)
            email: GeoRepo user email (optional)
            kwargs: Extra arguments (ignored)

        Returns:
            AdminBoundaries instance
        """
        cls.logger.info(
            f"Loading data from UNICEF GeoRepo for country: {country_code}, admin level: {admin_level}"
        )
        from gigaspatial.handlers.unicef_georepo import get_country_boundaries_by_iso3

        # Fetch boundaries from GeoRepo
        geojson = get_country_boundaries_by_iso3(country_code, admin_level=admin_level)

        features = geojson.get("features", [])
        boundaries = []
        parent_level = admin_level - 1

        for feat in features:
            props = feat.get("properties", {})
            geometry = feat.get("geometry")
            shapely_geom = shape(geometry) if geometry else None
            # For admin_level 0, no parent_id
            parent_id = None
            if admin_level > 0:
                parent_id = props.get(f"adm{parent_level}_ucode")

            boundary = AdminBoundary(
                id=props.get("ucode"),
                name=props.get("name"),
                name_en=props.get("name_en"),
                geometry=shapely_geom,
                parent_id=parent_id,
                country_code=country_code,
            )
            boundaries.append(boundary)

        cls.logger.info(
            f"Created {len(boundaries)} AdminBoundary objects from GeoRepo data."
        )

        # Try to infer country_code from first boundary if not set
        if boundaries and not boundaries[0].country_code:
            boundaries[0].country_code = boundaries[0].id[:3]

        return cls(boundaries=boundaries, level=admin_level)

    @classmethod
    def from_geoboundaries(cls, country_code: str, admin_level: int = 0) -> "AdminBoundaries":
        """Load and create an instance from geoBoundaries data hosted on HDX.

        Searches HDX for the geoBoundaries subnational-boundaries dataset of
        the given country, downloads the GeoJSON resource for the requested
        admin level into a temporary directory, and converts the rows into
        AdminBoundary objects.

        Args:
            country_code: ISO3 country code; used (lowercased) in the HDX
                query and (uppercased) to match the resource file name.
            admin_level: Administrative level (0=country, 1=state/province, ...).

        Returns:
            AdminBoundaries: Parsed boundaries, or an empty instance when the
                download/parse step fails with a known error type.

        Raises:
            ValueError: If no HDX dataset or no matching resource is found.
        """
        cls.logger.info(
            f"Searching for geoBoundaries data for country: {country_code}, admin level: {admin_level}"
        )

        # Only the top search hit is requested (rows=1); the per-level file is
        # selected from that dataset's resources below.
        country_datasets = HDXConfig.search_datasets(
            query=f'dataseries_name:"geoBoundaries - Subnational Administrative Boundaries" AND groups:"{country_code.lower()}"',
            rows=1,
        )
        if not country_datasets:
            cls.logger.error(f"No datasets found for country: {country_code}")
            raise ValueError(
                "No resources found for the specified country. Please check your search parameters and try again."
            )

        cls.logger.info(f"Found dataset: {country_datasets[0].get('title', 'Unknown')}")

        # Pick the GeoJSON resource whose name matches the requested admin level.
        resources = [
            resource
            for resource in country_datasets[0].get_resources()
            if (
                resource.data["name"]
                == f"geoBoundaries-{country_code.upper()}-ADM{admin_level}.geojson"
            )
        ]

        if not resources:
            cls.logger.error(
                f"No resources found for {country_code} at admin level {admin_level}"
            )
            raise ValueError(
                "No resources found for the specified criteria. Please check your search parameters and try again."
            )

        cls.logger.info(f"Found resource: {resources[0].data.get('name', 'Unknown')}")

        try:
            cls.logger.info("Downloading and processing boundary data...")
            with tempfile.TemporaryDirectory() as tmpdir:
                url, local_path = resources[0].download(folder=tmpdir)
                cls.logger.debug(f"Downloaded file to temporary path: {local_path}")
                # Read fully while the temporary directory still exists; the
                # GeoDataFrame is materialized before tmpdir cleanup.
                with open(local_path, "rb") as f:
                    gdf = gpd.read_file(f)

            gdf = cls._map_fields(gdf, "geoBoundaries", admin_level)
            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(
                f"Successfully created {len(boundaries)} AdminBoundary objects"
            )
            return cls(boundaries=boundaries, level=admin_level)

        # Known download/parse failures degrade to an empty (schema-only)
        # instance instead of propagating.
        except (ValueError, HTTPError, FileNotFoundError) as e:
            cls.logger.warning(
                f"Error loading geoBoundaries data for {country_code} at admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(
                country_code, admin_level, "geoBoundaries"
            )

    @classmethod
    def from_global_country_boundaries(cls, scale: str = "medium") -> "AdminBoundaries":
        """
        Build an AdminBoundaries instance covering every country, using
        Natural Earth Data's admin-0 country shapefiles.

        Args:
            scale (str): Map detail level; one of 'large' (10m),
                'medium' (50m) or 'small' (110m).

        Returns:
            AdminBoundaries: Every country boundary, at admin_level=0.

        Raises:
            ValueError: If `scale` is not one of the recognised options.
        """
        resolutions = {
            "large": "10m",
            "medium": "50m",
            "small": "110m",
        }
        if scale not in resolutions:
            raise ValueError(
                f"Invalid scale '{scale}'. Choose from 'large', 'medium', 'small'."
            )
        folder = resolutions[scale]
        url = f"https://naciscdn.org/naturalearth/{folder}/cultural/ne_{folder}_admin_0_countries.zip"
        cls.logger.info(f"Loading Natural Earth global country boundaries from {url}")
        try:
            gdf = gpd.read_file(url)
            boundaries = []
            # Translate Natural Earth attributes into the AdminBoundary schema,
            # trying the more specific ISO code / name columns first.
            for _, record in gdf.iterrows():
                code = (
                    record.get("ISO_A3_EH")
                    or record.get("ISO_A3")
                    or record.get("ADM0_A3")
                )
                label = (
                    record.get("NAME")
                    or record.get("ADMIN")
                    or record.get("SOVEREIGNT")
                )
                geom = record.get("geometry")
                # Rows missing a code, a name, or a geometry are skipped.
                if not code or not label or geom is None:
                    continue
                boundaries.append(
                    AdminBoundary(
                        id=code,
                        name=label,
                        geometry=geom,
                        country_code=code,
                    )
                )
            cls.logger.info(
                f"Loaded {len(boundaries)} country boundaries from Natural Earth."
            )
            return cls(boundaries=boundaries, level=0)
        except Exception as e:
            cls.logger.error(f"Failed to load Natural Earth global boundaries: {e}")
            raise

    @classmethod
    def create(
        cls,
        country_code: Optional[str] = None,
        admin_level: int = 0,
        data_store: Optional[DataStore] = None,
        path: Optional[Union[str, "Path"]] = None,
        **kwargs,
    ) -> "AdminBoundaries":
        """
        Factory method to create an AdminBoundaries instance using various data sources,
        depending on the provided parameters and global configuration.

        Loading Logic:
            1. If a `data_store` is provided and either a `path` is given or
               `global_config.ADMIN_BOUNDARIES_DATA_DIR` is set:
                - If `path` is not provided but `country_code` is, the path is constructed
                  using `global_config.get_admin_path()`.
                - Loads boundaries from the specified data store and path.

            2. If only `country_code` is provided (no data_store):
                - Attempts to load boundaries from GeoRepo (if available).
                - If GeoRepo is unavailable, attempts to load from GADM.
                - If GADM fails, falls back to geoBoundaries.
                - Raises an error if all sources fail.

            3. If neither `country_code` nor `data_store` is provided:
                - Raises a ValueError.

        Args:
            country_code (Optional[str]): ISO country code (2 or 3 letter) or country name.
            admin_level (int): Administrative level (0=country, 1=state/province, etc.).
            data_store (Optional[DataStore]): Optional data store instance for loading from existing data.
            path (Optional[Union[str, Path]]): Optional path to data file (used with data_store).
            **kwargs: Additional arguments passed to the underlying creation methods.

        Returns:
            AdminBoundaries: Configured instance.

        Raises:
            ValueError: If neither country_code nor (data_store, path) are provided,
                        or if country_code lookup fails.
            RuntimeError: If all data sources fail to load boundaries.

        Examples:
            # Load from a data store (path auto-generated if not provided)
            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

            # Load from a specific file in a data store
            boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

            # Load from online sources (GeoRepo, GADM, geoBoundaries)
            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
        """
        cls.logger.info(
            f"Creating AdminBoundaries instance. Country: {country_code}, "
            f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
            f"path provided: {path is not None}"
        )

        # A data store can only be used when a path is given directly or can
        # be derived via the configured admin-boundaries directory.
        from_data_store = data_store is not None and (
            global_config.ADMIN_BOUNDARIES_DATA_DIR is not None or path is not None
        )

        # Validate input parameters
        if not country_code and not data_store:
            raise ValueError("Either country_code or data_store must be provided.")

        if from_data_store and not path and not country_code:
            raise ValueError(
                "If data_store is provided, either path or country_code must also be specified."
            )

        # Handle data store path first
        if from_data_store:
            iso3_code = None
            if country_code:
                # pycountry.lookup accepts names, alpha-2 or alpha-3 codes
                # (see the Args doc above) and is normalized to alpha-3 here.
                try:
                    iso3_code = pycountry.countries.lookup(country_code).alpha_3
                except LookupError as e:
                    raise ValueError(f"Invalid country code '{country_code}': {e}")

            # Generate path if not provided
            if path is None and iso3_code:
                path = global_config.get_admin_path(
                    country_code=iso3_code,
                    admin_level=admin_level,
                )

            return cls.from_data_store(data_store, path, admin_level, **kwargs)

        # Handle country code path
        if country_code is not None:
            try:
                iso3_code = pycountry.countries.lookup(country_code).alpha_3
            except LookupError as e:
                raise ValueError(f"Invalid country code '{country_code}': {e}")

            # Try GeoRepo first. Note: only the availability probe is guarded;
            # a failure inside from_georepo itself will propagate.
            if cls._try_georepo(iso3_code, admin_level):
                return cls.from_georepo(iso3_code, admin_level=admin_level)

            # Fallback to GADM
            try:
                cls.logger.info("Attempting to load from GADM.")
                return cls.from_gadm(iso3_code, admin_level, **kwargs)
            except Exception as e:
                cls.logger.warning(
                    f"GADM loading failed: {e}. Falling back to geoBoundaries."
                )

            # Final fallback to geoBoundaries
            try:
                return cls.from_geoboundaries(iso3_code, admin_level)
            except Exception as e:
                cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
                raise RuntimeError(
                    f"Failed to load administrative boundaries for {country_code} "
                    f"from all available sources (GeoRepo, GADM, geoBoundaries)."
                ) from e

        # This should never be reached due to validation above
        raise ValueError("Unexpected error: no valid data source could be determined.")

    @classmethod
    def _try_georepo(cls, iso3_code: str, admin_level: int) -> bool:
        """Probe whether the GeoRepo service is usable.

        Args:
            iso3_code: ISO3 country code (not consulted by the probe itself).
            admin_level: Administrative level (not consulted by the probe itself).

        Returns:
            bool: True when the client imports, initializes and reports a
                working connection; False in every failure case.
        """
        connected = False
        try:
            from gigaspatial.handlers.unicef_georepo import GeoRepoClient

            connected = GeoRepoClient().check_connection()
        except ImportError:
            cls.logger.info("GeoRepo client not available (import failed).")
            return False
        except ValueError as e:
            cls.logger.warning(f"GeoRepo initialization failed: {e}")
            return False
        except Exception as e:
            cls.logger.warning(f"GeoRepo error: {e}")
            return False

        if connected:
            cls.logger.info("GeoRepo connection successful.")
        else:
            cls.logger.info("GeoRepo connection failed.")
        return connected

    @classmethod
    def _create_empty_instance(
        cls, country_code: Optional[str], admin_level: int, source_type: str
    ) -> "AdminBoundaries":
        """Return an instance with no boundaries but a recorded column schema.

        The column names are stored on `_empty_schema` so that
        `to_geodataframe()` can later emit an empty frame with the columns the
        given source would normally produce.
        """
        empty = cls(boundaries=[], level=admin_level, country_code=country_code)

        # Start from the source-specific field mapping, then add the fields
        # common to every source.
        columns = set(cls.get_schema_config()[source_type].keys())
        columns.update(["geometry", "country_code", "id", "name", "name_en"])
        if admin_level > 0:
            # Sub-national levels also carry a reference to their parent unit.
            columns.add("parent_id")

        empty._empty_schema = list(columns)
        return empty

    @classmethod
    def _map_fields(
        cls,
        gdf: gpd.GeoDataFrame,
        source: str,
        current_level: int,
    ) -> gpd.GeoDataFrame:
        """Rename a source's columns to the AdminBoundary schema fields.

        Column-name templates in the schema config may contain `{level}` or
        `{parent_level}` placeholders, which are filled from `current_level`
        (and the level one above it) before the rename.
        """
        schema = cls.get_schema_config().get(source, {})
        parent_level = current_level - 1

        rename_map = {}
        for target_field, template in schema.items():
            if "{parent_level}" in template:
                source_column = template.format(parent_level=parent_level)
            elif "{level}" in template:
                source_column = template.format(level=current_level)
            else:
                source_column = template
            rename_map[source_column] = target_field

        return gdf.rename(columns=rename_map)
create(country_code=None, admin_level=0, data_store=None, path=None, **kwargs) classmethod

Factory method to create an AdminBoundaries instance using various data sources, depending on the provided parameters and global configuration.

Loading Logic
  1. If a data_store is provided and either a path is given or global_config.ADMIN_BOUNDARIES_DATA_DIR is set:

    • If path is not provided but country_code is, the path is constructed using global_config.get_admin_path().
    • Loads boundaries from the specified data store and path.
  2. If only country_code is provided (no data_store):

    • Attempts to load boundaries from GeoRepo (if available).
    • If GeoRepo is unavailable, attempts to load from GADM.
    • If GADM fails, falls back to geoBoundaries.
    • Raises an error if all sources fail.
  3. If neither country_code nor data_store is provided:

    • Raises a ValueError.

Parameters:

Name Type Description Default
country_code Optional[str]

ISO country code (2 or 3 letter) or country name.

None
admin_level int

Administrative level (0=country, 1=state/province, etc.).

0
data_store Optional[DataStore]

Optional data store instance for loading from existing data.

None
path Optional[Union[str, Path]]

Optional path to data file (used with data_store).

None
**kwargs

Additional arguments passed to the underlying creation methods.

{}

Returns:

Name Type Description
AdminBoundaries AdminBoundaries

Configured instance.

Raises:

Type Description
ValueError

If neither country_code nor (data_store, path) are provided, or if country_code lookup fails.

RuntimeError

If all data sources fail to load boundaries.

Examples:

Load from a data store (path auto-generated if not provided)

boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

Load from a specific file in a data store

boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

Load from online sources (GeoRepo, GADM, geoBoundaries)

boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def create(
    cls,
    country_code: Optional[str] = None,
    admin_level: int = 0,
    data_store: Optional[DataStore] = None,
    path: Optional[Union[str, "Path"]] = None,
    **kwargs,
) -> "AdminBoundaries":
    """
    Factory method to create an AdminBoundaries instance using various data sources,
    depending on the provided parameters and global configuration.

    Loading Logic:
        1. If a `data_store` is provided and either a `path` is given or
           `global_config.ADMIN_BOUNDARIES_DATA_DIR` is set:
            - If `path` is not provided but `country_code` is, the path is constructed
              using `global_config.get_admin_path()`.
            - Loads boundaries from the specified data store and path.

        2. If only `country_code` is provided (no data_store):
            - Attempts to load boundaries from GeoRepo (if available).
            - If GeoRepo is unavailable, attempts to load from GADM.
            - If GADM fails, falls back to geoBoundaries.
            - Raises an error if all sources fail.

        3. If neither `country_code` nor `data_store` is provided:
            - Raises a ValueError.

    Args:
        country_code (Optional[str]): ISO country code (2 or 3 letter) or country name.
        admin_level (int): Administrative level (0=country, 1=state/province, etc.).
        data_store (Optional[DataStore]): Optional data store instance for loading from existing data.
        path (Optional[Union[str, Path]]): Optional path to data file (used with data_store).
        **kwargs: Additional arguments passed to the underlying creation methods.

    Returns:
        AdminBoundaries: Configured instance.

    Raises:
        ValueError: If neither country_code nor (data_store, path) are provided,
                    or if country_code lookup fails.
        RuntimeError: If all data sources fail to load boundaries.

    Examples:
        # Load from a data store (path auto-generated if not provided)
        boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

        # Load from a specific file in a data store
        boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

        # Load from online sources (GeoRepo, GADM, geoBoundaries)
        boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
    """
    cls.logger.info(
        f"Creating AdminBoundaries instance. Country: {country_code}, "
        f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
        f"path provided: {path is not None}"
    )

    from_data_store = data_store is not None and (
        global_config.ADMIN_BOUNDARIES_DATA_DIR is not None or path is not None
    )

    # Validate input parameters
    if not country_code and not data_store:
        raise ValueError("Either country_code or data_store must be provided.")

    if from_data_store and not path and not country_code:
        raise ValueError(
            "If data_store is provided, either path or country_code must also be specified."
        )

    # Handle data store path first
    if from_data_store:
        iso3_code = None
        if country_code:
            try:
                iso3_code = pycountry.countries.lookup(country_code).alpha_3
            except LookupError as e:
                raise ValueError(f"Invalid country code '{country_code}': {e}")

        # Generate path if not provided
        if path is None and iso3_code:
            path = global_config.get_admin_path(
                country_code=iso3_code,
                admin_level=admin_level,
            )

        return cls.from_data_store(data_store, path, admin_level, **kwargs)

    # Handle country code path
    if country_code is not None:
        try:
            iso3_code = pycountry.countries.lookup(country_code).alpha_3
        except LookupError as e:
            raise ValueError(f"Invalid country code '{country_code}': {e}")

        # Try GeoRepo first
        if cls._try_georepo(iso3_code, admin_level):
            return cls.from_georepo(iso3_code, admin_level=admin_level)

        # Fallback to GADM
        try:
            cls.logger.info("Attempting to load from GADM.")
            return cls.from_gadm(iso3_code, admin_level, **kwargs)
        except Exception as e:
            cls.logger.warning(
                f"GADM loading failed: {e}. Falling back to geoBoundaries."
            )

        # Final fallback to geoBoundaries
        try:
            return cls.from_geoboundaries(iso3_code, admin_level)
        except Exception as e:
            cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
            raise RuntimeError(
                f"Failed to load administrative boundaries for {country_code} "
                f"from all available sources (GeoRepo, GADM, geoBoundaries)."
            ) from e

    # This should never be reached due to validation above
    raise ValueError("Unexpected error: no valid data source could be determined.")
from_data_store(data_store, path, admin_level=0, **kwargs) classmethod

Load and create instance from internal data store.

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_data_store(
    cls,
    data_store: DataStore,
    path: Union[str, "Path"],
    admin_level: int = 0,
    **kwargs,
) -> "AdminBoundaries":
    """Load and create instance from internal data store."""
    cls.logger.info(
        f"Loading data from data store at path: {path}, admin level: {admin_level}"
    )
    try:
        gdf = read_dataset(data_store, str(path), **kwargs)

        if gdf.empty:
            cls.logger.warning(f"No data found at {path}.")
            return cls._create_empty_instance(None, admin_level, "internal")

        gdf = cls._map_fields(gdf, "internal", admin_level)

        if admin_level == 0:
            gdf["id"] = gdf["country_code"]
        else:
            gdf["parent_id"] = gdf["id"].apply(lambda x: x[:-3])

        boundaries = [
            AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
        ]
        cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
        return cls(boundaries=boundaries, level=admin_level)

    except (FileNotFoundError, KeyError) as e:
        cls.logger.warning(
            f"No data found at {path} for admin level {admin_level}: {str(e)}"
        )
        cls.logger.info("Falling back to empty instance")
        return cls._create_empty_instance(None, admin_level, "internal")
from_gadm(country_code, admin_level=0, **kwargs) classmethod

Load and create instance from GADM data.

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_gadm(
    cls, country_code: str, admin_level: int = 0, **kwargs
) -> "AdminBoundaries":
    """Load and create instance from GADM data."""
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{country_code}_{admin_level}.json"
    cls.logger.info(
        f"Loading GADM data for country: {country_code}, admin level: {admin_level} from URL: {url}"
    )
    try:
        gdf = gpd.read_file(url)

        gdf = cls._map_fields(gdf, "gadm", admin_level)

        if admin_level == 0:
            gdf["country_code"] = gdf["id"]
            gdf["name"] = gdf["COUNTRY"]
        elif admin_level == 1:
            gdf["country_code"] = gdf["parent_id"]

        boundaries = [
            AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
        ]
        cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
        return cls(
            boundaries=boundaries, level=admin_level, country_code=country_code
        )

    except (ValueError, HTTPError, FileNotFoundError) as e:
        cls.logger.warning(
            f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
        )
        cls.logger.info("Falling back to empty instance")
        return cls._create_empty_instance(country_code, admin_level, "gadm")
from_georepo(country_code=None, admin_level=0, **kwargs) classmethod

Load and create instance from GeoRepo (UNICEF) API.

Parameters:

Name Type Description Default
country_code str

ISO3 code of the country whose boundaries should be fetched

None
admin_level int

Administrative level (0=country, 1=state, etc.)

0
**kwargs

Extra arguments (ignored)

{}

Returns:

Type Description
AdminBoundaries

AdminBoundaries instance

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_georepo(
    cls,
    country_code: str = None,
    admin_level: int = 0,
    **kwargs,
) -> "AdminBoundaries":
    """
    Load and create instance from GeoRepo (UNICEF) API.

    Args:
        country: Country name (if using name-based lookup)
        iso3: ISO3 code (if using code-based lookup)
        admin_level: Administrative level (0=country, 1=state, etc.)
        api_key: GeoRepo API key (optional)
        email: GeoRepo user email (optional)
        kwargs: Extra arguments (ignored)

    Returns:
        AdminBoundaries instance
    """
    cls.logger.info(
        f"Loading data from UNICEF GeoRepo for country: {country_code}, admin level: {admin_level}"
    )
    from gigaspatial.handlers.unicef_georepo import get_country_boundaries_by_iso3

    # Fetch boundaries from GeoRepo
    geojson = get_country_boundaries_by_iso3(country_code, admin_level=admin_level)

    features = geojson.get("features", [])
    boundaries = []
    parent_level = admin_level - 1

    for feat in features:
        props = feat.get("properties", {})
        geometry = feat.get("geometry")
        shapely_geom = shape(geometry) if geometry else None
        # For admin_level 0, no parent_id
        parent_id = None
        if admin_level > 0:
            parent_id = props.get(f"adm{parent_level}_ucode")

        boundary = AdminBoundary(
            id=props.get("ucode"),
            name=props.get("name"),
            name_en=props.get("name_en"),
            geometry=shapely_geom,
            parent_id=parent_id,
            country_code=country_code,
        )
        boundaries.append(boundary)

    cls.logger.info(
        f"Created {len(boundaries)} AdminBoundary objects from GeoRepo data."
    )

    # Try to infer country_code from first boundary if not set
    if boundaries and not boundaries[0].country_code:
        boundaries[0].country_code = boundaries[0].id[:3]

    return cls(boundaries=boundaries, level=admin_level)
from_global_country_boundaries(scale='medium') classmethod

Load global country boundaries from Natural Earth Data.

Parameters:

Name Type Description Default
scale str

One of 'large', 'medium', 'small'. - 'large' -> 10m - 'medium' -> 50m - 'small' -> 110m

'medium'
Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_global_country_boundaries(cls, scale: str = "medium") -> "AdminBoundaries":
    """
    Load global country boundaries from Natural Earth Data.

    Args:
        scale (str): One of 'large', 'medium', 'small'.
            - 'large'  -> 10m
            - 'medium' -> 50m
            - 'small'  -> 110m
    Returns:
        AdminBoundaries: All country boundaries at admin_level=0
    """
    scale_map = {
        "large": "10m",
        "medium": "50m",
        "small": "110m",
    }
    if scale not in scale_map:
        raise ValueError(
            f"Invalid scale '{scale}'. Choose from 'large', 'medium', 'small'."
        )
    scale_folder = scale_map[scale]
    url = f"https://naciscdn.org/naturalearth/{scale_folder}/cultural/ne_{scale_folder}_admin_0_countries.zip"
    cls.logger.info(f"Loading Natural Earth global country boundaries from {url}")
    try:
        gdf = gpd.read_file(url)
        # Map fields to AdminBoundary schema
        boundaries = []
        for _, row in gdf.iterrows():
            iso_a3 = row.get("ISO_A3_EH") or row.get("ISO_A3") or row.get("ADM0_A3")
            name = row.get("NAME") or row.get("ADMIN") or row.get("SOVEREIGNT")
            geometry = row.get("geometry")
            if not iso_a3 or not name or geometry is None:
                continue
            boundary = AdminBoundary(
                id=iso_a3,
                name=name,
                geometry=geometry,
                country_code=iso_a3,
            )
            boundaries.append(boundary)
        cls.logger.info(
            f"Loaded {len(boundaries)} country boundaries from Natural Earth."
        )
        return cls(boundaries=boundaries, level=0)
    except Exception as e:
        cls.logger.error(f"Failed to load Natural Earth global boundaries: {e}")
        raise
get_schema_config() classmethod

Return field mappings for different data sources

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
    """Return field mappings for different data sources"""
    return cls._schema_config
to_geodataframe()

Convert the AdminBoundaries to a GeoDataFrame.

Source code in gigaspatial/handlers/boundaries.py
def to_geodataframe(self) -> gpd.GeoDataFrame:
    """Convert the AdminBoundaries to a GeoDataFrame."""
    if not self.boundaries:
        if hasattr(self, "_empty_schema"):
            columns = self._empty_schema
        else:
            columns = ["id", "name", "country_code", "geometry"]
            if self.level > 0:
                columns.append("parent_id")

        return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)

    return gpd.GeoDataFrame(
        [boundary.model_dump() for boundary in self.boundaries],
        geometry="geometry",
        crs=4326,
    )

AdminBoundary

Bases: BaseModel

Base class for administrative boundary data with flexible fields.

Source code in gigaspatial/handlers/boundaries.py
class AdminBoundary(BaseModel):
    """Base class for administrative boundary data with flexible fields."""

    id: str = Field(..., description="Unique identifier for the administrative unit")
    name: str = Field(..., description="Primary local name")
    geometry: Union[Polygon, MultiPolygon] = Field(
        ..., description="Geometry of the administrative boundary"
    )

    name_en: Optional[str] = Field(
        None, description="English name if different from local name"
    )
    parent_id: Optional[str] = Field(
        None, description="ID of parent administrative unit"
    )
    country_code: Optional[str] = Field(
        None, min_length=3, max_length=3, description="ISO 3166-1 alpha-3 country code"
    )

    class Config:
        arbitrary_types_allowed = True

ghsl

CoordSystem

Bases: int, Enum

Enum for coordinate systems used by GHSL datasets.

Source code in gigaspatial/handlers/ghsl.py
class CoordSystem(int, Enum):
    """Enum for coordinate systems used by GHSL datasets."""

    WGS84 = 4326
    Mollweide = 54009

GHSLDataConfig dataclass

Bases: BaseHandlerConfig

Source code in gigaspatial/handlers/ghsl.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GHSLDataConfig(BaseHandlerConfig):
    """Configuration for GHSL (Global Human Settlement Layer) datasets.

    Validates that the requested product / year / resolution / coordinate
    system combination is one GHSL actually publishes, and loads the tile
    grid used to resolve spatial queries into downloadable tile IDs.
    """

    # constants
    # GHSL epochs: 5-year steps 1975-2030 plus the special 2018 release.
    AVAILABLE_YEARS: List = Field(default=np.append(np.arange(1975, 2031, 5), 2018))
    AVAILABLE_RESOLUTIONS: List = Field(default=[10, 100, 1000])

    # base config
    GHSL_DB_BASE_URL: HttpUrl = Field(
        default="https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/"
    )
    # Template URL; the '{}' is filled with the coordinate system code
    # (4326 or 54009) exactly once, in validate_configuration().
    TILES_URL: str = "https://ghsl.jrc.ec.europa.eu/download/GHSL_data_{}_shapefile.zip"

    # user config
    base_path: Path = Field(default=global_config.get_path("ghsl", "bronze"))
    coord_system: CoordSystem = CoordSystem.WGS84
    release: str = "R2023A"

    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ] = Field(...)
    year: int = 2020
    resolution: int = 100

    def __post_init__(self):
        # Delegate shared setup (logger, data store, paths) to the base config.
        super().__post_init__()

    def _load_tiles(self):
        """Load GHSL tiles from tiles shapefile."""
        # Check cache first
        cache_dir = self.base_path / "cache"
        tiles_cache = cache_dir / f"tiles_{self.coord_system.value}.geojson"

        if self.data_store.file_exists(tiles_cache):

            try:
                # NOTE(review): existence is checked through data_store but the
                # read goes straight to the filesystem via geopandas — confirm
                # these agree for non-local data stores.
                self.tiles_gdf = gpd.read_file(tiles_cache)
                self.logger.info(f"Loaded tiles from cache")
                return
            except Exception:
                self.logger.warning("Cache invalid, re-downloading")

        # Deferred imports: only needed on the (re-)download path.
        import shutil
        import ssl
        import warnings
        from gigaspatial.core.io.writers import write_dataset

        temp_dir = None

        try:
            # Try with SSL verification first
            try:
                self.logger.info("Attempting download GHSL tile grid with SSL verification")
                # Set up SSL context before any requests
                # NOTE(review): this installs an *unverified* HTTPS context
                # process-wide, contradicting the log line above and affecting
                # all later HTTPS requests in this process — confirm intended.
                ssl._create_default_https_context = ssl._create_unverified_context
                self.tiles_gdf = gpd.read_file(self.TILES_URL)
                self.logger.info("Download successful with SSL verification")

            except Exception as e:
                # Fall back to requests with SSL verification
                self.logger.warning(
                    f"Geopandas download with SSL verification failed ({type(e).__name__}): {e}"
                )
                self.logger.warning(
                    "Retrying GHSL tile grid download with requests and without SSL verification"
                )

                # Download tiles with requests
                temp_dir = Path(tempfile.mkdtemp())
                zip_path = temp_dir / "tiles.zip"

                # Suppress SSL warnings when using verify=False
                from urllib3.exceptions import InsecureRequestWarning

                warnings.filterwarnings("ignore", category=InsecureRequestWarning)

                response = requests.get(self.TILES_URL, verify=False)
                response.raise_for_status()
                self.logger.info("Download successful without SSL verification")

                # Write downloaded content to file
                with open(zip_path, "wb") as f:
                    f.write(response.content)

                # Read the shapefile from the zip
                self.tiles_gdf = gpd.read_file(zip_path)

            # Cache it
            write_dataset(self.tiles_gdf, self.data_store, str(tiles_cache))
            self.logger.info("Tiles cached successfully")

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download tiles: {e}")
            raise ValueError(
                f"Could not download GHSL tiles from {self.TILES_URL}"
            ) from e
        finally:
            # Cleanup temp directory if it was created
            if temp_dir is not None and temp_dir.exists():
                shutil.rmtree(temp_dir)

    @field_validator("year")
    def validate_year(cls, value: int) -> int:
        # Accept only epochs GHSL actually publishes (see AVAILABLE_YEARS).
        if value in cls.AVAILABLE_YEARS:
            return value
        raise ValueError(
            f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
        )

    @field_validator("resolution")
    def validate_resolution(cls, value: int) -> int:
        # Accept only the published grid resolutions (metres).
        if value in cls.AVAILABLE_RESOLUTIONS:
            return value
        raise ValueError(
            f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
        )

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Specific rules:
        - GHS_BUILT_V, GHS_POP and GHS_SMOD are not published for 2018.
        - 10 m resolution is intended to be restricted to built products.
        - Building-height (GHS_BUILT_H_*) products exist only for 2018.
        - GHS_BUILT_S for 2018 is forced to 10 m; 10 m implies 2018 and
          the Mollweide projection.
        - GHS_SMOD is only published at 1000 m in Mollweide.

        Incompatible settings are either rejected (ValueError) or coerced
        to the nearest valid value with a warning.
        """
        if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
            raise ValueError(f"{self.product} product is not available for 2018")

        # NOTE(review): product can never equal "GHS_BUILT_H" — the Literal
        # values are GHS_BUILT_H_AGBH / GHS_BUILT_H_ANBH — so this raises for
        # *every* product at 10 m, including the building-height and
        # GHS_BUILT_S cases handled below. Likely intended:
        # `"GHS_BUILT_H" not in self.product`. Confirm against the GHSL catalog.
        if self.resolution == 10 and self.product != "GHS_BUILT_H":
            raise ValueError(
                f"{self.product} product is not available at 10 (10m) resolution"
            )

        if "GHS_BUILT_H" in self.product:
            if self.year != 2018:
                self.logger.warning(
                    "Building height product is only available for 2018, year is set as 2018"
                )
                self.year = 2018

        if self.product == "GHS_BUILT_S":
            if self.year == 2018 and self.resolution != 10:
                self.logger.warning(
                    "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
                )
                self.resolution = 10

            if self.resolution == 10 and self.year != 2018:
                self.logger.warning(
                    "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
                )
                self.year = 2018

            if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        if self.product == "GHS_SMOD":
            if self.resolution != 1000:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
                )
                self.resolution = 1000

            if self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        # Resolve the tile-grid URL for the final CRS (one-shot format; this
        # validator must not run twice) and load the grid — note this performs
        # network/disk I/O during model validation.
        self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
        self._load_tiles()

        return self

    @property
    def crs(self) -> str:
        # Authority string matching coord_system (EPSG for WGS84, ESRI for Mollweide).
        return "EPSG:4326" if self.coord_system == CoordSystem.WGS84 else "ESRI:54009"

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.
        """
        crs = kwargs.get("crs", "EPSG:4326")  # assume WGS84 4326 if no crs passed
        if self.tiles_gdf.crs != crs:
            # Reproject the query geometry into the tile grid's CRS.
            geometry = (
                gpd.GeoDataFrame(geometry=[geometry], crs=crs)
                .to_crs(self.tiles_gdf.crs)
                .geometry[0]
            )

        # Find intersecting tiles
        # NOTE(review): mask is a generator; confirm pandas .loc accepts it on
        # the pinned pandas version (a list comprehension would be safer).
        mask = (tile_geom.intersects(geometry) for tile_geom in self.tiles_gdf.geometry)

        intersecting_tiles = self.tiles_gdf.loc[mask, "tile_id"].to_list()

        return intersecting_tiles

    def get_data_unit_path(self, unit: str = None, file_ext=".zip", **kwargs) -> Path:
        """Construct and return the path for the configured dataset or dataset tile."""
        info = self._get_product_info()

        # <base>/<product_folder>/<product_name>_V<ver>_0[_<unit>]<ext>
        tile_path = (
            self.base_path
            / info["product_folder"]
            / (
                f"{info['product_name']}_V{info['product_version']}_0"
                + (f"_{unit}" if unit else "")
                + file_ext
            )
        )

        return tile_path

    def extract_search_geometry(self, source, **kwargs):
        # Default the search CRS to this dataset's CRS when the caller did not
        # specify one (coord_system is an int enum, so == 4326 matches WGS84).
        source_crs = kwargs.pop("crs", None)
        if not source_crs:
            source_crs = "EPSG:4326" if self.coord_system==4326 else "ESRI:54009"
        return super().extract_search_geometry(source, crs=source_crs, **kwargs)

    def compute_dataset_url(self, tile_id=None) -> str:
        """Compute the download URL for a GHSL dataset."""
        info = self._get_product_info()

        # When tile_id is None the "tiles" segment is empty, which leaves a
        # double slash in the joined URL.
        path_segments = [
            str(self.GHSL_DB_BASE_URL),
            info["product_folder"],
            info["product_name"],
            f"V{info['product_version']}-0",
            "tiles" if tile_id else "",
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{tile_id}" if tile_id else "")
            + ".zip",
        ]

        return "/".join(path_segments)

    def _get_product_info(self) -> dict:
        """Generate and return common product information used in multiple methods."""
        # Mollweide datasets are named by metre resolution; WGS84 datasets by
        # arc-seconds (100 m -> 3ss, otherwise 30ss).
        resolution_str = (
            str(self.resolution)
            if self.coord_system == CoordSystem.Mollweide
            else ("3ss" if self.resolution == 100 else "30ss")
        )
        product_folder = f"{self.product}_GLOBE_{self.release}"
        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system.value}_{resolution_str}"
        # SMOD ships as version 2; all other products as version 1.
        product_version = 2 if self.product == "GHS_SMOD" else 1

        return {
            "resolution_str": resolution_str,
            "product_folder": product_folder,
            "product_name": product_name,
            "product_version": product_version,
        }

    def __repr__(self) -> str:
        """Return a string representation of the GHSL dataset configuration."""
        return (
            f"GHSLDataConfig("
            f"product='{self.product}', "
            f"year={self.year}, "
            f"resolution={self.resolution}, "
            f"coord_system={self.coord_system.name}, "
            f"release='{self.release}'"
            f")"
        )
__repr__()

Return a string representation of the GHSL dataset configuration.

Source code in gigaspatial/handlers/ghsl.py
def __repr__(self) -> str:
    """Return a string representation of the GHSL dataset configuration."""
    fields = ", ".join(
        [
            f"product='{self.product}'",
            f"year={self.year}",
            f"resolution={self.resolution}",
            f"coord_system={self.coord_system.name}",
            f"release='{self.release}'",
        ]
    )
    return f"GHSLDataConfig({fields})"
compute_dataset_url(tile_id=None)

Compute the download URL for a GHSL dataset.

Source code in gigaspatial/handlers/ghsl.py
def compute_dataset_url(self, tile_id=None) -> str:
    """Compute the download URL for a GHSL dataset.

    When ``tile_id`` is given, the URL points at the per-tile archive under
    the product's ``tiles`` directory; otherwise it points at the global
    archive (the empty "tiles" segment then leaves a double slash, matching
    the layout of the GHSL FTP mirror).
    """
    info = self._get_product_info()

    archive = f"{info['product_name']}_V{info['product_version']}_0"
    if tile_id:
        archive += f"_{tile_id}"
    archive += ".zip"

    segments = [
        str(self.GHSL_DB_BASE_URL),
        info["product_folder"],
        info["product_name"],
        f"V{info['product_version']}-0",
        "tiles" if tile_id else "",
        archive,
    ]
    return "/".join(segments)
get_data_unit_path(unit=None, file_ext='.zip', **kwargs)

Construct and return the path for the configured dataset or dataset tile.

Source code in gigaspatial/handlers/ghsl.py
def get_data_unit_path(self, unit: str = None, file_ext=".zip", **kwargs) -> Path:
    """Construct and return the path for the configured dataset or dataset tile."""
    info = self._get_product_info()

    tile_path = (
        self.base_path
        / info["product_folder"]
        / (
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{unit}" if unit else "")
            + file_ext
        )
    )

    return tile_path
get_relevant_data_units_by_geometry(geometry, **kwargs)

Return intersecting tiles for a given geometry or GeoDataFrame.

Source code in gigaspatial/handlers/ghsl.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a given geometry or GeoDataFrame.
    """
    # Callers are assumed to pass WGS84 coordinates unless they say otherwise;
    # reproject the query geometry into the tile grid's CRS when they differ.
    search_crs = kwargs.get("crs", "EPSG:4326")
    if self.tiles_gdf.crs != search_crs:
        reprojected = gpd.GeoDataFrame(geometry=[geometry], crs=search_crs).to_crs(
            self.tiles_gdf.crs
        )
        geometry = reprojected.geometry[0]

    # Boolean mask of tiles whose geometry intersects the query geometry.
    hits = [tile_geom.intersects(geometry) for tile_geom in self.tiles_gdf.geometry]

    return self.tiles_gdf.loc[hits, "tile_id"].to_list()
validate_configuration()

Validate that the configuration is valid based on dataset availability constraints.

Specific rules: the availability constraints on product/year/resolution/CRS combinations enforced in the code below.
Source code in gigaspatial/handlers/ghsl.py
@model_validator(mode="after")
def validate_configuration(self):
    """
    Validate that the configuration is valid based on dataset availability constraints.

    Specific rules:
    - GHS_BUILT_V, GHS_POP and GHS_SMOD are not published for 2018.
    - 10 m resolution is intended to be restricted to built products.
    - Building-height (GHS_BUILT_H_*) products exist only for 2018.
    - GHS_BUILT_S for 2018 is forced to 10 m; 10 m implies 2018 and
      the Mollweide projection.
    - GHS_SMOD is only published at 1000 m in Mollweide.

    Incompatible settings are either rejected (ValueError) or coerced to
    the nearest valid value with a warning.
    """
    if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
        raise ValueError(f"{self.product} product is not available for 2018")

    # NOTE(review): product can never equal "GHS_BUILT_H" — the Literal values
    # are GHS_BUILT_H_AGBH / GHS_BUILT_H_ANBH — so this raises for *every*
    # product at 10 m, including the cases handled below. Likely intended:
    # `"GHS_BUILT_H" not in self.product`. Confirm against the GHSL catalog.
    if self.resolution == 10 and self.product != "GHS_BUILT_H":
        raise ValueError(
            f"{self.product} product is not available at 10 (10m) resolution"
        )

    if "GHS_BUILT_H" in self.product:
        if self.year != 2018:
            self.logger.warning(
                "Building height product is only available for 2018, year is set as 2018"
            )
            self.year = 2018

    if self.product == "GHS_BUILT_S":
        if self.year == 2018 and self.resolution != 10:
            self.logger.warning(
                "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
            )
            self.resolution = 10

        if self.resolution == 10 and self.year != 2018:
            self.logger.warning(
                "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
            )
            self.year = 2018

        if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
            self.logger.warning(
                f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
            )
            self.coord_system = CoordSystem.Mollweide

    if self.product == "GHS_SMOD":
        if self.resolution != 1000:
            self.logger.warning(
                f"Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
            )
            self.resolution = 1000

        if self.coord_system != CoordSystem.Mollweide:
            self.logger.warning(
                f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
            )
            self.coord_system = CoordSystem.Mollweide

    # Resolve the tile-grid URL for the final CRS (one-shot format; this
    # validator must not run twice) and load the grid — note this performs
    # network/disk I/O during model validation.
    self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
    self._load_tiles()

    return self

GHSLDataDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of GHSL datasets.

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataDownloader(BaseHandlerDownloader):
    """A class to handle downloads of GHSL datasets.

    Resolves the tiles relevant to a geographic source (country, points,
    geometry or GeoDataFrame) via the config, then downloads the per-tile
    zip archives — optionally extracting selected members — in parallel.
    """

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        # Coerce a plain dict into a validated GHSLDataConfig.
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_id: str,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> Optional[Union[Path, List[Path]]]:
        """
        Downloads and optionally extracts files for a given tile.

        Args:
            tile_id: tile ID to process.
            extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            Path to the downloaded file if extract=False,
            List of paths to the extracted files if extract=True,
            None on failure.
        """
        url = self.config.compute_dataset_url(tile_id=tile_id)
        output_path = self.config.get_data_unit_path(tile_id)

        # Without extraction, stream the zip straight into the data store.
        if not extract:
            return self._download_file(url, output_path)

        extracted_files: List[Path] = []
        temp_downloaded_path: Optional[Path] = None

        try:
            # Download to a local temp file first, since zipfile needs
            # random access; extracted members go through the data store.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
                temp_downloaded_path = Path(temp_file.name)
                self.logger.debug(
                    f"Downloading {url} to temporary file: {temp_downloaded_path}"
                )

                response = requests.get(url, stream=True)
                response.raise_for_status()

                # content-length may be absent; tqdm then shows an open-ended bar.
                total_size = int(response.headers.get("content-length", 0))

                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {tile_id}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.info(f"Successfully downloaded temporary file!")

            with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
                if file_pattern:
                    import re

                    # Keep only archive members matching the pattern
                    # (anchored at the start — re.match semantics).
                    pattern = re.compile(file_pattern)
                    files_to_extract = [
                        f for f in zip_ref.namelist() if pattern.match(f)
                    ]
                else:
                    files_to_extract = zip_ref.namelist()

                for file in files_to_extract:
                    # Flatten archive paths: members land next to output_path
                    # under their base name only.
                    extracted_path = output_path.parent / Path(file).name
                    with zip_ref.open(file) as source:
                        file_content = source.read()
                        self.data_store.write_file(str(extracted_path), file_content)
                    extracted_files.append(extracted_path)
                    self.logger.info(f"Extracted {file} to {extracted_path}")

            # Redundant with the finally block below (which only unlinks if
            # the file still exists), but removes the temp file eagerly.
            Path(temp_file.name).unlink()
            return extracted_files

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url} to temporary file: {e}")
            return None
        except zipfile.BadZipFile:
            self.logger.error(f"Downloaded file for {tile_id} is not a valid zip file.")
            return None
        except Exception as e:
            self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
            return None
        finally:
            if temp_downloaded_path and temp_downloaded_path.exists():
                try:
                    temp_downloaded_path.unlink()
                    self.logger.debug(f"Deleted temporary file: {temp_downloaded_path}")
                except OSError as e:
                    self.logger.warning(
                        f"Could not delete temporary file {temp_downloaded_path}: {e}"
                    )

    def download_data_units(
        self,
        tile_ids: List[str],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Downloads multiple tiles in parallel, with an option to extract them.

        Args:
            tile_ids: A list of tile IDs to download.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            A list where each element corresponds to a tile ID and contains:
            - Path to the downloaded file if extract=False.
            - List of paths to extracted files if extract=True.
            - None if the download or extraction failed for a tile.
        """
        if not tile_ids:
            self.logger.warning("No tiles to download")
            return []

        # NOTE: the partial closes over self, so self (config, data_store,
        # logger) must be picklable for the worker processes.
        with multiprocessing.Pool(processes=self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, extract=extract, file_pattern=file_pattern
            )
            file_paths = list(
                tqdm(
                    pool.imap(download_func, tile_ids),
                    total=len(tile_ids),
                    desc=f"Downloading data",
                )
            )

        return file_paths

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specified geographic region.

        The region can be defined by a country code/name, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant GHSL tiles intersecting the region and downloads the
        specified type of data (polygons or points) for those tiles in parallel.

        Args:
            source: Defines the geographic area for which to download data.
                    Can be:
                      - A string representing a country code or name.
                      - A list of (latitude, longitude) tuples or Shapely Point objects.
                      - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                      - A GeoDataFrame with geometry column in EPSG:4326.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments. These will be passed down to
                      `AdminBoundaries.create()` (if `source` is a country)
                      and to `self.download_data_units()`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(
            tiles, extract=extract, file_pattern=file_pattern, **kwargs
        )

    def download_by_country(
        self,
        country_code: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country_code: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                        `AdminBoundaries` for loading country boundaries. If None,
                        `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                               country boundary. If provided, this boundary is used
                               instead of the default from `AdminBoundaries`.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments that are passed to
                      `download_data_units`. For example, `extract` to download and extract.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        # data_store and path travel through download()'s **kwargs down to the
        # config's country-boundary resolution.
        return self.download(
            source=country_code,
            data_store=data_store,
            path=country_geom_path,
            extract=extract,
            file_pattern=file_pattern,
            **kwargs,
        )

    def _download_file(self, url: str, output_path: Path) -> Optional[Path]:
        """
        Downloads a file from a URL to a specified output path with a progress bar.

        Args:
            url: The URL to download from.
            output_path: The local path to save the downloaded file.

        Returns:
            The path to the downloaded file on success, None on failure.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            # Stream directly into the data store in 8 KiB chunks.
            with self.data_store.open(str(output_path), "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {output_path.name}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.debug(f"Successfully downloaded: {url} to {output_path}")
            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {str(e)}")
            return None
__init__(config, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Union[GHSLDataConfig, dict[str, Union[str, int]]]

Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters

required
data_store Optional[DataStore]

Optional data storage interface. If not provided, uses LocalDataStore.

None
logger Optional[Logger]

Optional custom logger. If not provided, uses default logger.

None
Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
        data_store: Optional data storage interface. If not provided, uses LocalDataStore.
        logger: Optional custom logger. If not provided, uses default logger.
    """
    # A plain dict is coerced into a validated GHSLDataConfig.
    if not isinstance(config, GHSLDataConfig):
        config = GHSLDataConfig(**config)
    super().__init__(config=config, data_store=data_store, logger=logger)
download(source, extract=True, file_pattern='.*\\.tif$', **kwargs)

Download GHSL data for a specified geographic region.

The region can be defined by a country code/name, a list of points, a Shapely geometry, or a GeoDataFrame. This method identifies the relevant GHSL tiles intersecting the region and downloads the specified type of data (polygons or points) for those tiles in parallel.

Parameters:

Name Type Description Default
source Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame]

Defines the geographic area for which to download data. Can be: - A string representing a country code or name. - A list of (latitude, longitude) tuples or Shapely Point objects. - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon). - A GeoDataFrame with geometry column in EPSG:4326.

required
extract bool

If True and the downloaded files are zips, extract their contents. Defaults to True.

True
file_pattern Optional[str]

Optional regex pattern to filter extracted files (if extract=True).

'.*\\.tif$'
**kwargs

Additional keyword arguments. These will be passed down to AdminBoundaries.create() (if source is a country) and to self.download_data_units().

{}

Returns:

Type Description
List[Optional[Union[Path, List[Path]]]]

A list of local file paths for the successfully downloaded tiles.

List[Optional[Union[Path, List[Path]]]]

Returns an empty list if no data is found for the region or if

List[Optional[Union[Path, List[Path]]]]

all downloads fail.

Source code in gigaspatial/handlers/ghsl.py
def download(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,  # shapely geoms
        gpd.GeoDataFrame,
    ],
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Download GHSL raster tiles covering a geographic region.

    The region may be given as a country code/name, a list of
    (latitude, longitude) tuples or Shapely Points, a Shapely geometry,
    or a GeoDataFrame in EPSG:4326. The tiles intersecting the region
    are resolved through the handler config and then downloaded in
    parallel.

    Args:
        source: Geographic area specification. One of:
                  - a country code or name string,
                  - a list of (lat, lon) tuples or Point objects,
                  - a Shapely BaseGeometry (Polygon, MultiPolygon, ...),
                  - a GeoDataFrame with geometry in EPSG:4326.
        extract: If True, extract downloaded zip archives. Defaults to True.
        file_pattern: Regex filtering the files extracted from each zip.
        **kwargs: Forwarded to `get_relevant_data_units` (e.g. for
                  `AdminBoundaries.create()` when `source` is a country)
                  and to `download_data_units`.

    Returns:
        Local paths of the successfully downloaded tiles. Empty list
        when no tile covers the region or every download fails.
    """
    # Resolve which tiles intersect the requested region, then fetch them.
    relevant_tiles = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(
        relevant_tiles,
        extract=extract,
        file_pattern=file_pattern,
        **kwargs,
    )
download_by_country(country_code, data_store=None, country_geom_path=None, extract=True, file_pattern='.*\\.tif$', **kwargs)

Download GHSL data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

Name Type Description Default
country_code str

The country code (e.g., 'USA', 'GBR') or name.

required
data_store Optional[DataStore]

Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading.

None
country_geom_path Optional[Union[str, Path]]

Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries.

None
extract bool

If True and the downloaded files are zips, extract their contents. Defaults to True.

True
file_pattern Optional[str]

Optional regex pattern to filter extracted files (if extract=True).

'.*\\.tif$'
**kwargs

Additional keyword arguments that are passed to download_data_units. For example, extract to download and extract.

{}

Returns:

Type Description
List[Optional[Union[Path, List[Path]]]]

A list of local file paths for the successfully downloaded tiles

List[Optional[Union[Path, List[Path]]]]

for the specified country.

Source code in gigaspatial/handlers/ghsl.py
def download_by_country(
    self,
    country_code: str,
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Download all GHSL tiles for one country.

    Thin convenience wrapper around `download` that passes the country
    code (or name) through as the source.

    Args:
        country_code: Country code (e.g. 'USA', 'GBR') or country name.
        data_store: Optional `DataStore` used by `AdminBoundaries` when
                    resolving the country boundary; its default loading
                    is used when None.
        country_geom_path: Optional path to a GeoJSON boundary file,
                           overriding the `AdminBoundaries` default.
        extract: If True, extract downloaded zip archives. Defaults to True.
        file_pattern: Regex filtering the files extracted from each zip.
        **kwargs: Forwarded to `download` / `download_data_units`.

    Returns:
        Local paths of the successfully downloaded tiles for the country.
    """
    # `country_geom_path` is exposed to `download` under the name `path`.
    return self.download(
        source=country_code,
        extract=extract,
        file_pattern=file_pattern,
        data_store=data_store,
        path=country_geom_path,
        **kwargs,
    )
download_data_unit(tile_id, extract=True, file_pattern='.*\\.tif$', **kwargs)

Downloads and optionally extracts files for a given tile.

Parameters:

Name Type Description Default
tile_id str

tile ID to process.

required
extract bool

If True and the downloaded file is a zip, extract its contents. Defaults to True.

True
file_pattern Optional[str]

Optional regex pattern to filter extracted files (if extract=True).

'.*\\.tif$'
**kwargs

Additional parameters passed to download methods

{}

Returns:

Type Description
Optional[Union[Path, List[Path]]]

Path to the downloaded file if extract=False,

Optional[Union[Path, List[Path]]]

List of paths to the extracted files if extract=True,

Optional[Union[Path, List[Path]]]

None on failure.

Source code in gigaspatial/handlers/ghsl.py
def download_data_unit(
    self,
    tile_id: str,
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> Optional[Union[Path, List[Path]]]:
    """
    Downloads and optionally extracts files for a given tile.

    Args:
        tile_id: tile ID to process.
        extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
        file_pattern: Optional regex pattern to filter extracted files (if extract=True).
        **kwargs: Additional parameters passed to download methods

    Returns:
        Path to the downloaded file if extract=False,
        List of paths to the extracted files if extract=True,
        None on failure.
    """
    import re

    url = self.config.compute_dataset_url(tile_id=tile_id)
    output_path = self.config.get_data_unit_path(tile_id)

    if not extract:
        # No extraction requested: plain file download to the final path.
        return self._download_file(url, output_path)

    extracted_files: List[Path] = []
    temp_downloaded_path: Optional[Path] = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
            temp_downloaded_path = Path(temp_file.name)
            self.logger.debug(
                f"Downloading {url} to temporary file: {temp_downloaded_path}"
            )

            # Stream the download so large tiles are never held fully in
            # memory; the timeout guards against a hung connection, and the
            # context manager guarantees the connection is released.
            with requests.get(url, stream=True, timeout=(30, 300)) as response:
                response.raise_for_status()

                total_size = int(response.headers.get("content-length", 0))

                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {tile_id}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

        self.logger.info("Successfully downloaded temporary file!")

        with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
            if file_pattern:
                pattern = re.compile(file_pattern)
                files_to_extract = [
                    f for f in zip_ref.namelist() if pattern.match(f)
                ]
            else:
                files_to_extract = zip_ref.namelist()

            for file in files_to_extract:
                # Flatten the archive layout: every extracted member is
                # written next to the configured output path by base name.
                extracted_path = output_path.parent / Path(file).name
                with zip_ref.open(file) as source:
                    file_content = source.read()
                    self.data_store.write_file(str(extracted_path), file_content)
                extracted_files.append(extracted_path)
                self.logger.info(f"Extracted {file} to {extracted_path}")

        # Temp-zip cleanup happens exactly once, in the finally block below
        # (the previous explicit unlink here was redundant with it).
        return extracted_files

    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download {url} to temporary file: {e}")
        return None
    except zipfile.BadZipFile:
        self.logger.error(f"Downloaded file for {tile_id} is not a valid zip file.")
        return None
    except Exception as e:
        self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
        return None
    finally:
        if temp_downloaded_path and temp_downloaded_path.exists():
            try:
                temp_downloaded_path.unlink()
                self.logger.debug(f"Deleted temporary file: {temp_downloaded_path}")
            except OSError as e:
                self.logger.warning(
                    f"Could not delete temporary file {temp_downloaded_path}: {e}"
                )
download_data_units(tile_ids, extract=True, file_pattern='.*\\.tif$', **kwargs)

Downloads multiple tiles in parallel, with an option to extract them.

Parameters:

Name Type Description Default
tile_ids List[str]

A list of tile IDs to download.

required
extract bool

If True and the downloaded files are zips, extract their contents. Defaults to True.

True
file_pattern Optional[str]

Optional regex pattern to filter extracted files (if extract=True).

'.*\\.tif$'
**kwargs

Additional parameters passed to download methods

{}

Returns:

Type Description
List[Optional[Union[Path, List[Path]]]]

A list where each element corresponds to a tile ID and contains:

List[Optional[Union[Path, List[Path]]]]
  • Path to the downloaded file if extract=False.
List[Optional[Union[Path, List[Path]]]]
  • List of paths to extracted files if extract=True.
List[Optional[Union[Path, List[Path]]]]
  • None if the download or extraction failed for a tile.
Source code in gigaspatial/handlers/ghsl.py
def download_data_units(
    self,
    tile_ids: List[str],
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Download several tiles in parallel, optionally extracting each zip.

    Args:
        tile_ids: Tile IDs to download.
        extract: If True, extract downloaded zip archives. Defaults to True.
        file_pattern: Regex filtering the files extracted from each zip.
        **kwargs: Additional parameters passed to download methods

    Returns:
        One entry per tile ID:
        - the downloaded file path when extract=False,
        - the list of extracted file paths when extract=True,
        - None when the download or extraction failed.
    """
    # Nothing to do for an empty request.
    if not tile_ids:
        self.logger.warning("No tiles to download")
        return []

    # Bind the per-tile options once so the pool workers share them.
    worker = functools.partial(
        self.download_data_unit, extract=extract, file_pattern=file_pattern
    )
    with multiprocessing.Pool(processes=self.config.n_workers) as pool:
        progress = tqdm(
            pool.imap(worker, tile_ids),
            total=len(tile_ids),
            desc=f"Downloading data",
        )
        results = list(progress)

    return results

GHSLDataHandler

Bases: BaseHandler

Handler for GHSL (Global Human Settlement Layer) dataset.

This class provides a unified interface for downloading and loading GHSL data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataHandler(BaseHandler):
    """
    Handler for GHSL (Global Human Settlement Layer) dataset.

    This class provides a unified interface for downloading and loading GHSL data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def __init__(
        self,
        product: Literal[
            "GHS_BUILT_S",
            "GHS_BUILT_H_AGBH",
            "GHS_BUILT_H_ANBH",
            "GHS_BUILT_V",
            "GHS_POP",
            "GHS_SMOD",
        ],
        year: int = 2020,
        resolution: int = 100,
        config: Optional[GHSLDataConfig] = None,
        downloader: Optional[GHSLDataDownloader] = None,
        reader: Optional[GHSLDataReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Initialize the GHSLDataHandler.

        Args:
            product: The GHSL product to use. Must be one of:
                    - GHS_BUILT_S: Built-up surface
                    - GHS_BUILT_H_AGBH: Average gross building height
                    - GHS_BUILT_H_ANBH: Average net building height
                    - GHS_BUILT_V: Building volume
                    - GHS_POP: Population
                    - GHS_SMOD: Settlement model
            year: The year of the data (default: 2020)
            resolution: The resolution in meters (default: 100)
            config: Optional configuration object
            downloader: Optional downloader instance
            reader: Optional reader instance
            data_store: Optional data store instance
            logger: Optional logger instance
            **kwargs: Additional configuration parameters
        """
        # Stored before super().__init__ because the base class invokes the
        # create_config hook, which reads these attributes.
        self._product = product
        self._year = year
        self._resolution = resolution
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GHSLDataConfig:
        """
        Create and return a GHSLDataConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GHSLDataConfig instance
        """
        return GHSLDataConfig(
            product=self._product,
            year=self._year,
            resolution=self._resolution,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataDownloader:
        """
        Create and return a GHSLDataDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GHSLDataDownloader instance
        """
        return GHSLDataDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataReader:
        """
        Create and return a GHSLDataReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GHSLDataReader instance
        """
        return GHSLDataReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        merge_rasters: bool = False,
        **kwargs,
    ):
        """
        Load GHSL raster data for a source specification.

        Pins the file extension and extraction pattern to GeoTIFFs and
        delegates to the base handler's load_data.

        Args:
            source: The data source specification (country, points, geometry,
                GeoDataFrame, or explicit file path(s)).
            crop_to_source: Forwarded to the base loader.
            ensure_available: If True, ensure data is downloaded before loading.
            merge_rasters: If True, merge all rasters into a single TifProcessor.
            **kwargs: Additional parameters forwarded to the base loader.

        Returns:
            TifProcessor object(s) for the loaded rasters (see the reader's
            load_from_paths).
        """
        return super().load_data(
            source=source,
            crop_to_source=crop_to_source,
            ensure_available=ensure_available,
            file_ext=".tif",
            extract=True,
            file_pattern=r".*\.tif$",
            merge_rasters=merge_rasters,
            **kwargs,
        )

    def load_into_dataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load GHSL data into a pandas DataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            DataFrame containing the GHSL data
        """
        # NOTE(review): **kwargs is forwarded both to load_data and to
        # to_dataframe — confirm the two accept disjoint keyword sets.
        tif_processors = self.load_data(
            source=source,
            crop_to_source=crop_to_source,
            ensure_available=ensure_available,
            **kwargs,
        )
        if isinstance(tif_processors, TifProcessor):
            return tif_processors.to_dataframe(**kwargs)
        return pd.concat(
            [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )

    def load_into_geodataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load GHSL data into a geopandas GeoDataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing the GHSL data
        """
        # NOTE(review): **kwargs is forwarded both to load_data and to
        # to_geodataframe — confirm the two accept disjoint keyword sets.
        tif_processors = self.load_data(
            source=source,
            crop_to_source=crop_to_source,
            ensure_available=ensure_available,
            **kwargs,
        )
        if isinstance(tif_processors, TifProcessor):
            return tif_processors.to_geodataframe(**kwargs)
        return pd.concat(
            [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )

    def get_available_data_info(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ) -> dict:
        """
        Report availability info for GHSL .tif data covering the source.

        Delegates to the base implementation with file_ext fixed to ".tif".

        Args:
            source: The data source specification (country, points, geometry,
                or GeoDataFrame).
            **kwargs: Additional parameters passed to the base implementation.

        Returns:
            Dictionary describing available data (see BaseHandler).
        """
        return super().get_available_data_info(source, file_ext=".tif", **kwargs)
__init__(product, year=2020, resolution=100, config=None, downloader=None, reader=None, data_store=None, logger=None, **kwargs)

Initialize the GHSLDataHandler.

Parameters:

Name Type Description Default
product Literal['GHS_BUILT_S', 'GHS_BUILT_H_AGBH', 'GHS_BUILT_H_ANBH', 'GHS_BUILT_V', 'GHS_POP', 'GHS_SMOD']

The GHSL product to use. Must be one of: - GHS_BUILT_S: Built-up surface - GHS_BUILT_H_AGBH: Average gross building height - GHS_BUILT_H_ANBH: Average net building height - GHS_BUILT_V: Building volume - GHS_POP: Population - GHS_SMOD: Settlement model

required
year int

The year of the data (default: 2020)

2020
resolution int

The resolution in meters (default: 100)

100
config Optional[GHSLDataConfig]

Optional configuration object

None
downloader Optional[GHSLDataDownloader]

Optional downloader instance

None
reader Optional[GHSLDataReader]

Optional reader instance

None
data_store Optional[DataStore]

Optional data store instance

None
logger Optional[Logger]

Optional logger instance

None
**kwargs

Additional configuration parameters

{}
Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ],
    year: int = 2020,
    resolution: int = 100,
    config: Optional[GHSLDataConfig] = None,
    downloader: Optional[GHSLDataDownloader] = None,
    reader: Optional[GHSLDataReader] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
    **kwargs,
):
    """
    Set up a GHSL handler for one product/year/resolution combination.

    Args:
        product: GHSL product identifier; one of
                 GHS_BUILT_S (built-up surface),
                 GHS_BUILT_H_AGBH (average gross building height),
                 GHS_BUILT_H_ANBH (average net building height),
                 GHS_BUILT_V (building volume),
                 GHS_POP (population),
                 GHS_SMOD (settlement model).
        year: Data epoch, defaults to 2020.
        resolution: Raster resolution in meters, defaults to 100.
        config: Pre-built configuration, if any.
        downloader: Pre-built downloader, if any.
        reader: Pre-built reader, if any.
        data_store: Storage backend to use.
        logger: Logger to use.
        **kwargs: Extra configuration parameters.
    """
    # Record the product selection first: the base constructor calls the
    # create_* hooks, which read these private attributes.
    self._product = product
    self._year = year
    self._resolution = resolution
    super().__init__(
        config=config,
        downloader=downloader,
        reader=reader,
        data_store=data_store,
        logger=logger,
    )
create_config(data_store, logger, **kwargs)

Create and return a GHSLDataConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
GHSLDataConfig

Configured GHSLDataConfig instance

Source code in gigaspatial/handlers/ghsl.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> GHSLDataConfig:
    """
    Build the GHSLDataConfig for this handler's product selection.

    Args:
        data_store: Storage backend handed to the config.
        logger: Logger handed to the config.
        **kwargs: Extra configuration parameters.

    Returns:
        A GHSLDataConfig bound to the handler's product, year and resolution.
    """
    # Combine the handler's stored product selection with the supplied
    # infrastructure objects.
    return GHSLDataConfig(
        data_store=data_store,
        logger=logger,
        product=self._product,
        year=self._year,
        resolution=self._resolution,
        **kwargs,
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a GHSLDataDownloader instance.

Parameters:

Name Type Description Default
config GHSLDataConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
GHSLDataDownloader

Configured GHSLDataDownloader instance

Source code in gigaspatial/handlers/ghsl.py
def create_downloader(
    self,
    config: GHSLDataConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GHSLDataDownloader:
    """
    Build the GHSLDataDownloader used by this handler.

    Args:
        config: Dataset configuration to hand to the downloader.
        data_store: Storage backend to hand to the downloader.
        logger: Logger to hand to the downloader.
        **kwargs: Extra downloader parameters.

    Returns:
        A configured GHSLDataDownloader.
    """
    # Pure factory: all wiring happens in the downloader constructor.
    return GHSLDataDownloader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a GHSLDataReader instance.

Parameters:

Name Type Description Default
config GHSLDataConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
GHSLDataReader

Configured GHSLDataReader instance

Source code in gigaspatial/handlers/ghsl.py
def create_reader(
    self,
    config: GHSLDataConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GHSLDataReader:
    """
    Build the GHSLDataReader used by this handler.

    Args:
        config: Dataset configuration to hand to the reader.
        data_store: Storage backend to hand to the reader.
        logger: Logger to hand to the reader.
        **kwargs: Extra reader parameters.

    Returns:
        A configured GHSLDataReader.
    """
    # Pure factory: all wiring happens in the reader constructor.
    return GHSLDataReader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
load_into_dataframe(source, crop_to_source=False, ensure_available=True, **kwargs)

Load GHSL data into a pandas DataFrame.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
DataFrame

DataFrame containing the GHSL data

Source code in gigaspatial/handlers/ghsl.py
def load_into_dataframe(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    ensure_available: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Load GHSL data for a source and flatten it into a pandas DataFrame.

    Args:
        source: The data source specification.
        crop_to_source: Forwarded to load_data.
        ensure_available: If True, download missing data before loading.
        **kwargs: Extra options forwarded to load_data and to_dataframe.

    Returns:
        DataFrame with the raster values of all loaded tiles.
    """
    processors = self.load_data(
        source=source,
        crop_to_source=crop_to_source,
        ensure_available=ensure_available,
        **kwargs,
    )
    # A merged raster arrives as one processor rather than a list.
    if isinstance(processors, TifProcessor):
        return processors.to_dataframe(**kwargs)
    frames = [tp.to_dataframe(**kwargs) for tp in processors]
    return pd.concat(frames, ignore_index=True)
load_into_geodataframe(source, crop_to_source=False, ensure_available=True, **kwargs)

Load GHSL data into a geopandas GeoDataFrame.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing the GHSL data

Source code in gigaspatial/handlers/ghsl.py
def load_into_geodataframe(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load GHSL data for a source and flatten it into a GeoDataFrame.

    Args:
        source: The data source specification.
        crop_to_source: Forwarded to load_data.
        ensure_available: If True, download missing data before loading.
        **kwargs: Extra options forwarded to load_data and to_geodataframe.

    Returns:
        GeoDataFrame with the raster values of all loaded tiles.
    """
    processors = self.load_data(
        source=source,
        crop_to_source=crop_to_source,
        ensure_available=ensure_available,
        **kwargs,
    )
    # A merged raster arrives as one processor rather than a list.
    if isinstance(processors, TifProcessor):
        return processors.to_geodataframe(**kwargs)
    frames = [tp.to_geodataframe(**kwargs) for tp in processors]
    return pd.concat(frames, ignore_index=True)

GHSLDataReader

Bases: BaseHandlerReader

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataReader(BaseHandlerReader):
    """
    Reader for GHSL raster data.

    Resolves source specifications to .tif paths and loads them as
    TifProcessor objects through the BaseHandlerReader machinery.
    """

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        # Accept either a ready config object or raw constructor kwargs.
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self,
        source_data_path: List[Union[str, Path]],
        merge_rasters: bool = False,
        **kwargs,
    ) -> Union[List[TifProcessor], TifProcessor]:
        """
        Load TifProcessors from GHSL dataset.
        Args:
            source_data_path: List of file paths to load
            merge_rasters: If True, all rasters will be merged into a single TifProcessor.
                           Defaults to False.
        Returns:
            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
                                                    TifProcessor if merge_rasters is True.
        """
        return self._load_raster_data(
            raster_paths=source_data_path, merge_rasters=merge_rasters
        )

    def load(
        self,
        source,
        crop_to_source: bool = False,
        merge_rasters: bool = False,
        **kwargs,
    ):
        """
        Load GHSL data from a source specification.

        Args:
            source: Source specification accepted by BaseHandlerReader.load.
            crop_to_source: Forwarded to the base loader.
            merge_rasters: If True, merge loaded rasters into one TifProcessor.
            **kwargs: Extra options; file_ext defaults to ".tif" unless
                explicitly supplied by the caller.
        """
        return super().load(
            source=source,
            crop_to_source=crop_to_source,
            file_ext=kwargs.pop("file_ext", ".tif"),
            merge_rasters=merge_rasters,
            **kwargs,
        )
__init__(config, data_store=None, logger=None)

Initialize the reader.

Parameters:

Name Type Description Default
config Union[GHSLDataConfig, dict[str, Union[str, int]]]

Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters

required
data_store Optional[DataStore]

Optional data storage interface. If not provided, uses LocalDataStore.

None
logger Optional[Logger]

Optional custom logger. If not provided, uses default logger.

None
Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the reader.

    Args:
        config: GHSL dataset configuration, supplied either as a ready
            GHSLDataConfig or as a dict of its constructor parameters.
        data_store: Storage backend; LocalDataStore is used when omitted.
        logger: Custom logger; a default logger is created when omitted.
    """
    # Coerce plain dicts into a validated configuration object.
    if not isinstance(config, GHSLDataConfig):
        config = GHSLDataConfig(**config)
    super().__init__(config=config, data_store=data_store, logger=logger)
load_from_paths(source_data_path, merge_rasters=False, **kwargs)

Load TifProcessors from GHSL dataset. Args: source_data_path: List of file paths to load merge_rasters: If True, all rasters will be merged into a single TifProcessor. Defaults to False. Returns: Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single TifProcessor if merge_rasters is True.

Source code in gigaspatial/handlers/ghsl.py
def load_from_paths(
    self,
    source_data_path: List[Union[str, Path]],
    merge_rasters: bool = False,
    **kwargs,
) -> Union[List[TifProcessor], TifProcessor]:
    """
    Load TifProcessors from GHSL dataset.

    Args:
        source_data_path: List of file paths to load
        merge_rasters: If True, all rasters will be merged into a single TifProcessor.
                       Defaults to False.
    Returns:
        Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
                                                TifProcessor if merge_rasters is True.
    """
    # Pure delegation: the shared raster-loading logic lives in the base class.
    processors = self._load_raster_data(
        raster_paths=source_data_path,
        merge_rasters=merge_rasters,
    )
    return processors

giga

GigaSchoolLocationFetcher

Fetch and process school location data from the Giga School Geolocation Data API.

Source code in gigaspatial/handlers/giga.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolLocationFetcher:
    """
    Fetch and process school location data from the Giga School Geolocation Data API.
    """

    # ISO country identifier (name, alpha-2, or alpha-3 accepted; normalized
    # to alpha-3 in __post_init__).
    country: str = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_location/country/{isocode3}",
        description="Base URL for the Giga School API",
    )
    api_key: str = global_config.GIGA_SCHOOL_LOCATION_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )

    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        # Normalize any country identifier to ISO alpha-3 before building the URL.
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")
        self.api_url = self.api_url.format(isocode3=self.country)
        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def fetch_locations(
        self, process_geospatial: bool = False, **kwargs
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Fetch and process school locations.

        Args:
            process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - timeout: Per-request timeout in seconds. Defaults to 30.

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: School locations; a GeoDataFrame
            with point geometries when process_geospatial is True, otherwise a plain
            DataFrame.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        # requests has no default timeout: without one, a stalled connection
        # would hang the pagination loop forever.
        timeout = kwargs.get("timeout", 30)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch school locations for country: {self.country}"
        )

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            params = {"page": page, "size": page_size}

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(
                    self.api_url, headers=headers, params=params, timeout=timeout
                )
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            except requests.exceptions.RequestException as e:
                # Network/HTTP failure: keep whatever pages were already fetched.
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)

        if process_geospatial:
            df = self._process_geospatial_data(df)

        return df

    def _process_geospatial_data(self, df: pd.DataFrame) -> gpd.GeoDataFrame:
        """
        Process and enhance the DataFrame with geospatial information.

        Args:
            df: Raw DataFrame from API

        Returns:
            gpd.GeoDataFrame: DataFrame with a WGS84 point geometry built from
            the longitude/latitude columns (unchanged input if empty).
        """
        if df.empty:
            return df

        # assumes every record carries numeric "longitude"/"latitude" fields —
        # TODO(review): confirm the API never returns nulls here.
        df["geometry"] = df.apply(
            lambda row: Point(row["longitude"], row["latitude"]), axis=1
        )
        self.logger.info(f"Created geometry for all {len(df)} records")

        return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
fetch_locations(process_geospatial=False, **kwargs)

Fetch and process school locations.

Parameters:

Name Type Description Default
process_geospatial bool

Whether to process geospatial data and return a GeoDataFrame. Defaults to False.

False
**kwargs

Additional parameters for customization - page_size: Override default page size - sleep_time: Override default sleep time between requests - max_pages: Limit the number of pages to fetch

{}

Returns:

Type Description
Union[DataFrame, GeoDataFrame]

pd.DataFrame or gpd.GeoDataFrame: School locations; a GeoDataFrame with point geometries when process_geospatial is True, otherwise a plain DataFrame.

Source code in gigaspatial/handlers/giga.py
def fetch_locations(
    self, process_geospatial: bool = False, **kwargs
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Fetch and process school locations.

    Args:
        process_geospatial (bool): Whether to process geospatial data and return a GeoDataFrame. Defaults to False.
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - timeout: Per-request timeout in seconds. Defaults to 30.

    Returns:
        Union[pd.DataFrame, gpd.GeoDataFrame]: School locations; a GeoDataFrame
        when process_geospatial is True, otherwise a plain DataFrame.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    # requests has no default timeout: without one, a stalled connection
    # would hang the pagination loop forever.
    timeout = kwargs.get("timeout", 30)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch school locations for country: {self.country}"
    )

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        params = {"page": page, "size": page_size}

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(
                self.api_url, headers=headers, params=params, timeout=timeout
            )
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        except requests.exceptions.RequestException as e:
            # Network/HTTP failure: keep whatever pages were already fetched.
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)

    if process_geospatial:
        df = self._process_geospatial_data(df)

    return df

GigaSchoolMeasurementsFetcher

Fetch and process school daily realtime connectivity measurements from the Giga API. This includes download/upload speeds, latency, and connectivity performance data.

Source code in gigaspatial/handlers/giga.py
(source line-number gutter 360–791 of gigaspatial/handlers/giga.py omitted)
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolMeasurementsFetcher:
    """
    Fetch and process school daily realtime connectivity measurements from the Giga API.
    This includes download/upload speeds, latency, and connectivity performance data.
    """

    # ISO country identifier (name, alpha-2, or alpha-3 accepted; normalized
    # to alpha-3 in __post_init__).
    country: str = Field(...)
    start_date: Union[str, date, datetime] = Field(...)
    end_date: Union[str, date, datetime] = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
        description="Base URL for the Giga School Measurements API",
    )
    api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )
    giga_id_school: Optional[str] = Field(
        default=None, description="Optional specific giga school ID to fetch"
    )

    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        # Convert dates to string format if needed
        self.start_date = self._format_date(self.start_date)
        self.end_date = self._format_date(self.end_date)

        # Validate date range. ISO YYYY-MM-DD strings order the same way
        # lexicographically as chronologically, so string comparison is safe.
        if self.start_date > self.end_date:
            raise ValueError("start_date must be before or equal to end_date")

        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def _format_date(self, date_input: Union[str, date, datetime]) -> str:
        """
        Convert date input to string format expected by API (YYYY-MM-DD).

        Args:
            date_input: Date in various formats

        Returns:
            str: Date in YYYY-MM-DD format

        Raises:
            ValueError: If the input cannot be interpreted as a date.
        """
        if isinstance(date_input, str):
            # Assume it's already in correct format or parse it
            try:
                parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
                return date_input
            except ValueError:
                try:
                    parsed_date = pd.to_datetime(date_input)
                    return parsed_date.strftime("%Y-%m-%d")
                # pd.to_datetime raises ValueError/TypeError on unparseable
                # input; catching only those (instead of a bare `except:`)
                # avoids masking KeyboardInterrupt/SystemExit and real bugs.
                except (ValueError, TypeError):
                    raise ValueError(
                        f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
                    )
        elif isinstance(date_input, (date, datetime)):
            return date_input.strftime("%Y-%m-%d")
        else:
            raise ValueError(f"Invalid date type: {type(date_input)}")

    def fetch_measurements(self, **kwargs) -> pd.DataFrame:
        """
        Fetch and process school connectivity measurements.

        Args:
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - giga_id_school: Override default giga_id_school filter
                - start_date: Override default start_date
                - end_date: Override default end_date
                - timeout: Per-request timeout in seconds. Defaults to 30.

        Returns:
            pd.DataFrame: School measurements with connectivity performance data.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
        start_date = kwargs.get("start_date", self.start_date)
        end_date = kwargs.get("end_date", self.end_date)
        # requests has no default timeout: without one, a stalled connection
        # would hang the pagination loop forever.
        timeout = kwargs.get("timeout", 30)

        # Format dates if overridden
        if start_date != self.start_date:
            start_date = self._format_date(start_date)
        if end_date != self.end_date:
            end_date = self._format_date(end_date)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch measurements for country: {self.country} "
            f"from {start_date} to {end_date}"
        )

        if giga_id_school:
            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Build parameters
            params = {
                "country_iso3_code": self.country,
                "start_date": start_date,
                "end_date": end_date,
                "page": page,
                "size": page_size,
            }

            # Add giga_id_school filter if specified
            if giga_id_school:
                params["giga_id_school"] = giga_id_school

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(
                    self.api_url, headers=headers, params=params, timeout=timeout
                )
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            except requests.exceptions.RequestException as e:
                # Network/HTTP failure: keep whatever pages were already fetched.
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            # If filtering by specific school ID, we might only need one page.
            # NOTE(review): this only logs; it never terminates the loop early.
            if giga_id_school and len(all_data) > 0:
                self.logger.info(
                    "Specific school ID requested, checking if more data needed"
                )

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)
        df = self._process_measurements_data(df)

        return df

    def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process and enhance the DataFrame with measurement performance metrics.

        Args:
            df: Raw DataFrame from API

        Returns:
            pd.DataFrame: Enhanced DataFrame with processed measurement data
        """
        if df.empty:
            return df

        # Convert date column to datetime
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            df["date_only"] = df["date"].dt.date
            df["year"] = df["date"].dt.year
            df["month"] = df["date"].dt.month
            df["day_of_week"] = df["date"].dt.day_name()
            self.logger.info("Processed date fields")

        # Process speed measurements
        numeric_columns = ["download_speed", "upload_speed", "latency"]
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Create performance categories
        if "download_speed" in df.columns:
            df["download_speed_category"] = pd.cut(
                df["download_speed"],
                bins=[0, 5, 25, 100, float("inf")],
                labels=[
                    "Very Slow (<5 Mbps)",
                    "Slow (5-25 Mbps)",
                    "Moderate (25-100 Mbps)",
                    "Fast (>100 Mbps)",
                ],
                include_lowest=True,
            )

        if "upload_speed" in df.columns:
            df["upload_speed_category"] = pd.cut(
                df["upload_speed"],
                bins=[0, 1, 10, 50, float("inf")],
                labels=[
                    "Very Slow (<1 Mbps)",
                    "Slow (1-10 Mbps)",
                    "Moderate (10-50 Mbps)",
                    "Fast (>50 Mbps)",
                ],
                include_lowest=True,
            )

        if "latency" in df.columns:
            df["latency_category"] = pd.cut(
                df["latency"],
                bins=[0, 50, 150, 300, float("inf")],
                labels=[
                    "Excellent (<50ms)",
                    "Good (50-150ms)",
                    "Fair (150-300ms)",
                    "Poor (>300ms)",
                ],
                include_lowest=True,
            )

        # Create quality flags
        if "download_speed" in df.columns and "upload_speed" in df.columns:
            df["has_broadband"] = (df["download_speed"] >= 25) & (
                df["upload_speed"] >= 3
            )
            df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
                df["upload_speed"] >= 0.5
            )

        # Flag measurements with missing data. Guarded like the blocks above:
        # indexing a missing column would raise KeyError otherwise.
        if all(col in df.columns for col in numeric_columns):
            df["has_complete_measurement"] = (
                df["download_speed"].notna()
                & df["upload_speed"].notna()
                & df["latency"].notna()
            )

        self.logger.info(f"Processed measurement data for {len(df)} records")

        return df

    def get_performance_summary(self, df: pd.DataFrame) -> dict:
        """
        Generate a comprehensive summary of connectivity performance metrics.

        Args:
            df: DataFrame with measurement data

        Returns:
            dict: Summary statistics about connectivity performance
        """
        if df.empty:
            return {"error": "No data available"}

        summary = {
            "total_measurements": len(df),
            "country": (
                df["country_iso3_code"].iloc[0]
                if "country_iso3_code" in df.columns
                else "Unknown"
            ),
            "date_range": {
                "start": (
                    df["date"].min().strftime("%Y-%m-%d")
                    if "date" in df.columns
                    else None
                ),
                "end": (
                    df["date"].max().strftime("%Y-%m-%d")
                    if "date" in df.columns
                    else None
                ),
            },
        }

        # School coverage
        if "giga_id_school" in df.columns:
            unique_schools = df["giga_id_school"].nunique()
            summary["unique_schools_measured"] = unique_schools
            summary["avg_measurements_per_school"] = (
                len(df) / unique_schools if unique_schools > 0 else 0
            )

        # Speed statistics
        for speed_col in ["download_speed", "upload_speed"]:
            if speed_col in df.columns:
                speed_data = df[speed_col].dropna()
                if len(speed_data) > 0:
                    summary[f"{speed_col}_stats"] = {
                        "mean": float(speed_data.mean()),
                        "median": float(speed_data.median()),
                        "min": float(speed_data.min()),
                        "max": float(speed_data.max()),
                        "std": float(speed_data.std()),
                    }

        # Latency statistics
        if "latency" in df.columns:
            latency_data = df["latency"].dropna()
            if len(latency_data) > 0:
                summary["latency_stats"] = {
                    "mean": float(latency_data.mean()),
                    "median": float(latency_data.median()),
                    "min": float(latency_data.min()),
                    "max": float(latency_data.max()),
                    "std": float(latency_data.std()),
                }

        # Performance categories
        for cat_col in [
            "download_speed_category",
            "upload_speed_category",
            "latency_category",
        ]:
            if cat_col in df.columns:
                cat_counts = df[cat_col].value_counts().to_dict()
                summary[cat_col.replace("_category", "_breakdown")] = cat_counts

        # Quality metrics
        if "has_broadband" in df.columns:
            summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
            summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)

        if "has_basic_connectivity" in df.columns:
            summary["basic_connectivity_measurements"] = int(
                df["has_basic_connectivity"].sum()
            )
            summary["basic_connectivity_percentage"] = float(
                df["has_basic_connectivity"].mean() * 100
            )

        # Data completeness
        if "has_complete_measurement" in df.columns:
            summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
            summary["data_completeness_percentage"] = float(
                df["has_complete_measurement"].mean() * 100
            )

        # Data sources
        if "data_source" in df.columns:
            source_counts = df["data_source"].value_counts().to_dict()
            summary["data_sources"] = source_counts

        # Temporal patterns
        if "day_of_week" in df.columns:
            day_counts = df["day_of_week"].value_counts().to_dict()
            summary["measurements_by_day_of_week"] = day_counts

        self.logger.info("Generated performance summary")
        return summary

    def get_school_performance_comparison(
        self, df: pd.DataFrame, top_n: int = 10
    ) -> dict:
        """
        Compare performance across schools.

        Args:
            df: DataFrame with measurement data
            top_n: Number of top/bottom schools to include

        Returns:
            dict: School performance comparison
        """
        if df.empty or "giga_id_school" not in df.columns:
            return {"error": "No school data available"}

        # Build the aggregation spec from columns that actually exist:
        # passing a missing column name to .agg() raises KeyError (the
        # original lambda fallback for "has_broadband" did not prevent that).
        agg_spec = {}
        for col, funcs in (
            ("download_speed", ["mean", "median", "count"]),
            ("upload_speed", ["mean", "median"]),
            ("latency", ["mean", "median"]),
        ):
            if col in df.columns:
                agg_spec[col] = funcs
        if "has_broadband" in df.columns:
            agg_spec["has_broadband"] = ["mean"]

        if not agg_spec:
            return {"error": "Insufficient data for school comparison"}

        school_stats = df.groupby("giga_id_school").agg(agg_spec).round(2)

        # Flatten column names
        school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]

        # Sort by download speed
        if "download_speed_mean" in school_stats.columns:
            top_schools = school_stats.nlargest(top_n, "download_speed_mean")
            bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")

            return {
                "top_performing_schools": top_schools.to_dict("index"),
                "bottom_performing_schools": bottom_schools.to_dict("index"),
                "total_schools_analyzed": len(school_stats),
            }

        return {"error": "Insufficient data for school comparison"}
fetch_measurements(**kwargs)

Fetch and process school connectivity measurements.

Parameters:

Name Type Description Default
**kwargs

Additional parameters for customization - page_size: Override default page size - sleep_time: Override default sleep time between requests - max_pages: Limit the number of pages to fetch - giga_id_school: Override default giga_id_school filter - start_date: Override default start_date - end_date: Override default end_date

{}

Returns:

Type Description
DataFrame

pd.DataFrame: School measurements with connectivity performance data.

Source code in gigaspatial/handlers/giga.py
def fetch_measurements(self, **kwargs) -> pd.DataFrame:
    """
    Fetch and process school connectivity measurements.

    Args:
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - giga_id_school: Override default giga_id_school filter
            - start_date: Override default start_date
            - end_date: Override default end_date
            - timeout: Per-request timeout in seconds. Defaults to 30.

    Returns:
        pd.DataFrame: School measurements with connectivity performance data.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
    start_date = kwargs.get("start_date", self.start_date)
    end_date = kwargs.get("end_date", self.end_date)
    # requests has no default timeout: without one, a stalled connection
    # would hang the pagination loop forever.
    timeout = kwargs.get("timeout", 30)

    # Format dates if overridden
    if start_date != self.start_date:
        start_date = self._format_date(start_date)
    if end_date != self.end_date:
        end_date = self._format_date(end_date)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch measurements for country: {self.country} "
        f"from {start_date} to {end_date}"
    )

    if giga_id_school:
        self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        # Build parameters
        params = {
            "country_iso3_code": self.country,
            "start_date": start_date,
            "end_date": end_date,
            "page": page,
            "size": page_size,
        }

        # Add giga_id_school filter if specified
        if giga_id_school:
            params["giga_id_school"] = giga_id_school

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(
                self.api_url, headers=headers, params=params, timeout=timeout
            )
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        except requests.exceptions.RequestException as e:
            # Network/HTTP failure: keep whatever pages were already fetched.
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        # If filtering by specific school ID, we might only need one page.
        # NOTE(review): this only logs; it never terminates the loop early.
        if giga_id_school and len(all_data) > 0:
            self.logger.info(
                "Specific school ID requested, checking if more data needed"
            )

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)
    df = self._process_measurements_data(df)

    return df
get_performance_summary(df)

Generate a comprehensive summary of connectivity performance metrics.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with measurement data

required

Returns:

Name Type Description
dict dict

Summary statistics about connectivity performance

Source code in gigaspatial/handlers/giga.py
def get_performance_summary(self, df: pd.DataFrame) -> dict:
    """
    Generate a comprehensive summary of connectivity performance metrics.

    Args:
        df: DataFrame with measurement data

    Returns:
        dict: Summary statistics about connectivity performance
    """
    if df.empty:
        return {"error": "No data available"}

    def _numeric_stats(series: pd.Series) -> dict:
        # Distribution summary, coerced to plain floats (JSON-friendly).
        return {
            "mean": float(series.mean()),
            "median": float(series.median()),
            "min": float(series.min()),
            "max": float(series.max()),
            "std": float(series.std()),
        }

    has_dates = "date" in df.columns
    summary = {
        "total_measurements": len(df),
        "country": (
            df["country_iso3_code"].iloc[0]
            if "country_iso3_code" in df.columns
            else "Unknown"
        ),
        "date_range": {
            "start": df["date"].min().strftime("%Y-%m-%d") if has_dates else None,
            "end": df["date"].max().strftime("%Y-%m-%d") if has_dates else None,
        },
    }

    # School coverage
    if "giga_id_school" in df.columns:
        n_schools = df["giga_id_school"].nunique()
        summary["unique_schools_measured"] = n_schools
        summary["avg_measurements_per_school"] = (
            len(df) / n_schools if n_schools > 0 else 0
        )

    # Speed and latency distributions (NaNs excluded before computing stats)
    for metric in ("download_speed", "upload_speed", "latency"):
        if metric not in df.columns:
            continue
        observed = df[metric].dropna()
        if len(observed) > 0:
            summary[f"{metric}_stats"] = _numeric_stats(observed)

    # Performance category breakdowns
    for cat_col in (
        "download_speed_category",
        "upload_speed_category",
        "latency_category",
    ):
        if cat_col in df.columns:
            summary[cat_col.replace("_category", "_breakdown")] = (
                df[cat_col].value_counts().to_dict()
            )

    # Quality / completeness flags: row count plus percentage of flagged rows.
    flag_outputs = {
        "has_broadband": (
            "broadband_capable_measurements",
            "broadband_percentage",
        ),
        "has_basic_connectivity": (
            "basic_connectivity_measurements",
            "basic_connectivity_percentage",
        ),
        "has_complete_measurement": (
            "complete_measurements",
            "data_completeness_percentage",
        ),
    }
    for flag_col, (count_key, pct_key) in flag_outputs.items():
        if flag_col in df.columns:
            summary[count_key] = int(df[flag_col].sum())
            summary[pct_key] = float(df[flag_col].mean() * 100)

    # Data sources
    if "data_source" in df.columns:
        summary["data_sources"] = df["data_source"].value_counts().to_dict()

    # Temporal patterns
    if "day_of_week" in df.columns:
        summary["measurements_by_day_of_week"] = (
            df["day_of_week"].value_counts().to_dict()
        )

    self.logger.info("Generated performance summary")
    return summary
get_school_performance_comparison(df, top_n=10)

Compare performance across schools.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with measurement data

required
top_n int

Number of top/bottom schools to include

10

Returns:

Name Type Description
dict dict

School performance comparison

Source code in gigaspatial/handlers/giga.py
def get_school_performance_comparison(
    self, df: pd.DataFrame, top_n: int = 10
) -> dict:
    """
    Compare performance across schools.

    Args:
        df: DataFrame with measurement data
        top_n: Number of top/bottom schools to include

    Returns:
        dict: School performance comparison, or an {"error": ...} dict when
            the input lacks school identifiers or usable metric columns.
    """
    if df.empty or "giga_id_school" not in df.columns:
        return {"error": "No school data available"}

    # Build the aggregation spec only from columns actually present.
    # The previous version referenced every metric column unconditionally,
    # which raised KeyError whenever an optional column was missing.
    agg_spec = {}
    if "download_speed" in df.columns:
        agg_spec["download_speed"] = ["mean", "median", "count"]
    if "upload_speed" in df.columns:
        agg_spec["upload_speed"] = ["mean", "median"]
    if "latency" in df.columns:
        agg_spec["latency"] = ["mean", "median"]
    if "has_broadband" in df.columns:
        agg_spec["has_broadband"] = ["mean"]

    if not agg_spec:
        return {"error": "Insufficient data for school comparison"}

    school_stats = df.groupby("giga_id_school").agg(agg_spec).round(2)

    # Flatten the (column, agg) MultiIndex into "column_agg" names
    school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]

    # Rank schools by mean download speed when that metric is available
    if "download_speed_mean" in school_stats.columns:
        top_schools = school_stats.nlargest(top_n, "download_speed_mean")
        bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")

        return {
            "top_performing_schools": top_schools.to_dict("index"),
            "bottom_performing_schools": bottom_schools.to_dict("index"),
            "total_schools_analyzed": len(school_stats),
        }

    return {"error": "Insufficient data for school comparison"}

GigaSchoolProfileFetcher

Fetch and process school profile data from the Giga School Profile API. This includes connectivity information and other school details.

Source code in gigaspatial/handlers/giga.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolProfileFetcher:
    """
    Fetch and process school profile data from the Giga School Profile API.
    This includes connectivity information and other school details.

    Pagination, HTTP errors and JSON decoding errors are handled inside
    `fetch_profiles`: failures stop the fetch loop and whatever was fetched
    so far is returned, rather than an exception being raised.
    """

    # Country identifier; normalized to an ISO3 alpha-3 code in __post_init__.
    country: str = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
        description="Base URL for the Giga School Profile API",
    )
    # Bearer token taken from the global configuration.
    api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )
    giga_id_school: Optional[str] = Field(
        default=None, description="Optional specific giga school ID to fetch"
    )

    # Replaced with a default class-named logger in __post_init__ when None.
    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        # pycountry.lookup accepts names and ISO codes; store the ISO3 code
        # because the API filters on `country_iso3_code`.
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def fetch_profiles(self, **kwargs) -> pd.DataFrame:
        """
        Fetch and process school profiles including connectivity information.

        Args:
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - giga_id_school: Override default giga_id_school filter

        Returns:
            pd.DataFrame: School profiles with connectivity and geospatial info.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch school profiles for country: {self.country}"
        )

        if giga_id_school:
            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

        # Paginate until: max_pages cap, request/parse error, empty page,
        # partial page (end of data), or single-school fetch completes.
        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Build parameters
            params = {
                "country_iso3_code": self.country,
                "page": page,
                "size": page_size,
            }

            # Add giga_id_school filter if specified
            if giga_id_school:
                params["giga_id_school"] = giga_id_school

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(self.api_url, headers=headers, params=params)
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            # Errors end the loop; partial results gathered so far are kept.
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            # If filtering by specific school ID, we likely only need one page
            if giga_id_school:
                self.logger.info(
                    "Specific school ID requested, stopping after first page"
                )
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)

        return df

    def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
        """
        Generate a summary of connectivity statistics from the fetched data.

        Only columns present in `df` contribute to the summary; missing
        columns are silently skipped.

        Args:
            df: DataFrame with school profile data

        Returns:
            dict: Summary statistics about connectivity
        """
        if df.empty:
            return {"error": "No data available"}

        summary = {
            "total_schools": len(df),
            # Assumes the frame holds a single country (first row wins).
            "country": (
                df["country_iso3_code"].iloc[0]
                if "country_iso3_code" in df.columns
                else "Unknown"
            ),
        }

        # Administrative region analysis (top 10 regions by school count)
        if "admin1" in df.columns:
            admin1_counts = df["admin1"].value_counts().head(10).to_dict()
            summary["top_admin1_regions"] = admin1_counts

        if "admin2" in df.columns:
            admin2_counts = df["admin2"].value_counts().head(10).to_dict()
            summary["top_admin2_regions"] = admin2_counts

        # Connectivity analysis
        if "connectivity" in df.columns:
            connected_count = df["connectivity"].sum()
            summary["schools_with_connectivity"] = int(connected_count)
            summary["connectivity_percentage"] = connected_count / len(df) * 100

        if "connectivity_RT" in df.columns:
            rt_connected_count = df["connectivity_RT"].sum()
            summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
            summary["realtime_connectivity_percentage"] = (
                rt_connected_count / len(df) * 100
            )

        # Connectivity type analysis
        if "connectivity_type" in df.columns:

            # Only break down types when at least one non-null value exists.
            if not all(df.connectivity_type.isna()):
                from collections import Counter

                type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
                summary["connectivity_types_breakdown"] = type_counts

        # Data source analysis
        if "connectivity_RT_datasource" in df.columns:
            datasource_counts = (
                df["connectivity_RT_datasource"].value_counts().to_dict()
            )
            summary["realtime_connectivity_datasources"] = datasource_counts

        if "school_data_source" in df.columns:
            school_datasource_counts = df["school_data_source"].value_counts().to_dict()
            summary["school_data_sources"] = school_datasource_counts

        self.logger.info("Generated connectivity summary")
        return summary
fetch_profiles(**kwargs)

Fetch and process school profiles including connectivity information.

Parameters:

Name Type Description Default
**kwargs

Additional parameters for customization: `page_size` (override the default page size), `sleep_time` (override the default sleep time between requests), `max_pages` (limit the number of pages fetched), and `giga_id_school` (override the default `giga_id_school` filter).

{}

Returns:

Type Description
DataFrame

pd.DataFrame: School profiles with connectivity and geospatial info.

Source code in gigaspatial/handlers/giga.py
def fetch_profiles(self, **kwargs) -> pd.DataFrame:
    """
    Fetch and process school profiles including connectivity information.

    Pagination, HTTP errors and JSON decoding errors are handled internally:
    failures stop the fetch loop and the records gathered so far are
    returned, rather than an exception being raised.

    Args:
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - giga_id_school: Override default giga_id_school filter

    Returns:
        pd.DataFrame: School profiles with connectivity and geospatial info.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch school profiles for country: {self.country}"
    )

    if giga_id_school:
        self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

    # Paginate until: max_pages cap, request/parse error, empty page,
    # partial page (end of data), or single-school fetch completes.
    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        # Build parameters
        params = {
            "country_iso3_code": self.country,
            "page": page,
            "size": page_size,
        }

        # Add giga_id_school filter if specified
        if giga_id_school:
            params["giga_id_school"] = giga_id_school

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(self.api_url, headers=headers, params=params)
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        # Errors end the loop; partial results gathered so far are kept.
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        # If filtering by specific school ID, we likely only need one page
        if giga_id_school:
            self.logger.info(
                "Specific school ID requested, stopping after first page"
            )
            break

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)

    return df
get_connectivity_summary(df)

Generate a summary of connectivity statistics from the fetched data.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with school profile data

required

Returns:

Name Type Description
dict dict

Summary statistics about connectivity

Source code in gigaspatial/handlers/giga.py
def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
    """
    Generate a summary of connectivity statistics from the fetched data.

    Args:
        df: DataFrame with school profile data

    Returns:
        dict: Summary statistics about connectivity
    """
    if df.empty:
        return {"error": "No data available"}

    country = "Unknown"
    if "country_iso3_code" in df.columns:
        # Assumes the frame holds a single country; first row wins.
        country = df["country_iso3_code"].iloc[0]

    summary = {"total_schools": len(df), "country": country}

    # Top 10 administrative regions by school count
    for admin_col in ("admin1", "admin2"):
        if admin_col in df.columns:
            summary[f"top_{admin_col}_regions"] = (
                df[admin_col].value_counts().head(10).to_dict()
            )

    # Connectivity counts and percentages
    if "connectivity" in df.columns:
        n_connected = df["connectivity"].sum()
        summary["schools_with_connectivity"] = int(n_connected)
        summary["connectivity_percentage"] = n_connected / len(df) * 100

    if "connectivity_RT" in df.columns:
        n_rt_connected = df["connectivity_RT"].sum()
        summary["schools_with_realtime_connectivity"] = int(n_rt_connected)
        summary["realtime_connectivity_percentage"] = (
            n_rt_connected / len(df) * 100
        )

    # Connectivity type breakdown, only when at least one value is present
    if "connectivity_type" in df.columns and not df["connectivity_type"].isna().all():
        from collections import Counter

        summary["connectivity_types_breakdown"] = dict(
            Counter(df["connectivity_type"].dropna().to_list())
        )

    # Data source breakdowns
    if "connectivity_RT_datasource" in df.columns:
        summary["realtime_connectivity_datasources"] = (
            df["connectivity_RT_datasource"].value_counts().to_dict()
        )

    if "school_data_source" in df.columns:
        summary["school_data_sources"] = (
            df["school_data_source"].value_counts().to_dict()
        )

    self.logger.info("Generated connectivity summary")
    return summary

google_open_buildings

GoogleOpenBuildingsConfig dataclass

Bases: BaseHandlerConfig

Configuration for Google Open Buildings dataset files. Implements the BaseHandlerConfig interface for data unit resolution.

Source code in gigaspatial/handlers/google_open_buildings.py
@dataclass
class GoogleOpenBuildingsConfig(BaseHandlerConfig):
    """
    Configuration for Google Open Buildings dataset files.
    Implements the BaseHandlerConfig interface for data unit resolution.
    """

    TILES_URL: str = (
        "https://openbuildings-public-dot-gweb-research.uw.r.appspot.com/public/tiles.geojson"
    )
    base_path: Path = global_config.get_path("google_open_buildings", "bronze")
    data_types: tuple = ("polygons", "points")

    def __post_init__(self):
        super().__post_init__()
        self._load_s2_tiles()

    def _load_s2_tiles(self):
        """Load the S2 tile index (tile_id, tile_url, size_mb, geometry) from the public GeoJSON."""
        response = requests.get(self.TILES_URL)
        response.raise_for_status()
        self.tiles_gdf = gpd.GeoDataFrame.from_features(
            response.json()["features"], crs="EPSG:4326"
        )

    def get_relevant_data_units(self, source, force_recompute: bool = False, **kwargs):
        # Tiles are indexed in EPSG:4326, so force that CRS for the lookup.
        return super().get_relevant_data_units(
            source, force_recompute, crs="EPSG:4326", **kwargs
        )

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.

        A GeoDataFrame is first collapsed to a single geometry, because a
        shapely `intersects` predicate cannot consume a GeoDataFrame
        directly; the intersection test itself is vectorized over the tile
        index instead of looping per tile in Python.
        """
        if isinstance(geometry, gpd.GeoDataFrame):
            geometry = geometry.geometry.unary_union
        mask = self.tiles_gdf.geometry.intersects(geometry)
        return self.tiles_gdf.loc[mask, ["tile_id", "tile_url", "size_mb"]].to_dict(
            "records"
        )

    def get_data_unit_path(
        self,
        unit: Union[pd.Series, dict, str],
        data_type: str = "polygons",
        **kwargs,
    ) -> Path:
        """
        Given a tile row or tile_id, return the corresponding file path.
        """
        tile_id = (
            unit["tile_id"]
            if isinstance(unit, pd.Series) or isinstance(unit, dict)
            else unit
        )
        return self.base_path / f"{data_type}_s2_level_4_{tile_id}_buildings.csv.gz"

    def get_data_unit_paths(
        self,
        units: Union[pd.DataFrame, Iterable[Union[dict, str]]],
        data_type: str = "polygons",
        **kwargs,
    ) -> list:
        """
        Given data unit identifiers, return the corresponding file paths.
        """
        if isinstance(units, pd.DataFrame):
            return [
                self.get_data_unit_path(row, data_type=data_type, **kwargs)
                for _, row in units.iterrows()
            ]
        return super().get_data_unit_paths(units, data_type=data_type)
get_data_unit_path(unit, data_type='polygons', **kwargs)

Given a tile row or tile_id, return the corresponding file path.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_data_unit_path(
    self,
    unit: Union[pd.Series, dict, str],
    data_type: str = "polygons",
    **kwargs,
) -> Path:
    """
    Resolve the local file path for a tile given a tile row, dict, or
    bare tile_id string.
    """
    if isinstance(unit, (pd.Series, dict)):
        tile_id = unit["tile_id"]
    else:
        tile_id = unit
    filename = f"{data_type}_s2_level_4_{tile_id}_buildings.csv.gz"
    return self.base_path / filename
get_data_unit_paths(units, data_type='polygons', **kwargs)

Given data unit identifiers, return the corresponding file paths.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_data_unit_paths(
    self,
    units: Union[pd.DataFrame, Iterable[Union[dict, str]]],
    data_type: str = "polygons",
    **kwargs,
) -> list:
    """
    Resolve file paths for a batch of data unit identifiers.
    """
    # Non-DataFrame inputs (iterables of dicts or tile_id strings) are
    # delegated to the base implementation.
    if not isinstance(units, pd.DataFrame):
        return super().get_data_unit_paths(units, data_type=data_type)

    paths = []
    for _, tile_row in units.iterrows():
        paths.append(self.get_data_unit_path(tile_row, data_type=data_type, **kwargs))
    return paths
get_relevant_data_units_by_geometry(geometry, **kwargs)

Return intersecting tiles for a given geometry or GeoDataFrame.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a given geometry or GeoDataFrame.

    Args:
        geometry: A shapely geometry, or a GeoDataFrame whose union of
            geometries defines the area of interest.

    Returns:
        One record per intersecting tile, with tile_id, tile_url and size_mb.
    """
    if isinstance(geometry, gpd.GeoDataFrame):
        # shapely predicates cannot consume a GeoDataFrame; collapse it to
        # a single geometry first.
        geometry = geometry.geometry.unary_union
    # Vectorized intersection test over the tile index instead of a
    # per-tile Python generator (which .loc may not accept as a mask).
    mask = self.tiles_gdf.geometry.intersects(geometry)
    return self.tiles_gdf.loc[mask, ["tile_id", "tile_url", "size_mb"]].to_dict(
        "records"
    )

GoogleOpenBuildingsDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of Google's Open Buildings dataset.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsDownloader(BaseHandlerDownloader):
    """A class to handle downloads of Google's Open Buildings dataset."""

    def __init__(
        self,
        config: Optional[GoogleOpenBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Optional configuration for file paths and download settings.
                    If None, a default `GoogleOpenBuildingsConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data
                        storage. If None, a `LocalDataStore` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        config = config or GoogleOpenBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_info: Union[pd.Series, dict],
        data_type: Literal["polygons", "points"] = "polygons",
    ) -> Optional[str]:
        """
        Download data file for a single tile.

        Returns the written file path on success, or None on any failure
        (failures are logged, not raised).

        data_type: The type of building data to download ('polygons' or 'points').
            Defaults to 'polygons'.
        """

        # The points variant is derived by substituting "polygons" in the
        # indexed tile URL (the index appears to list polygon URLs —
        # NOTE(review): confirm against the tile index schema).
        tile_url = tile_info["tile_url"]
        if data_type == "points":
            tile_url = tile_url.replace("polygons", "points")

        try:
            # Stream and write in 8 KiB chunks so large archives are never
            # held fully in memory.
            response = requests.get(tile_url, stream=True)
            response.raise_for_status()

            file_path = str(
                self.config.get_data_unit_path(
                    tile_info["tile_id"], data_type=data_type
                )
            )

            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

                self.logger.debug(
                    f"Successfully downloaded tile: {tile_info['tile_id']}"
                )
                return file_path

        # Network/HTTP failures and anything unexpected are reported as
        # None so batch downloads can continue with the remaining tiles.
        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
            )
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None

    def download_data_units(
        self,
        tiles: Union[pd.DataFrame, List[dict]],
        data_type: Literal["polygons", "points"] = "polygons",
    ) -> List[str]:
        """
        Download data files for multiple tiles.

        Downloads run in parallel across `self.config.n_workers` processes;
        only paths of successful downloads are returned.

        data_type: The type of building data to download ('polygons' or 'points').
            Defaults to 'polygons'.
        """

        if len(tiles) == 0:
            self.logger.warning(f"There is no matching data")
            return []

        # Fan the per-tile downloads out over a process pool with a
        # progress bar; failed tiles yield None and are filtered below.
        with multiprocessing.Pool(self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, data_type=data_type
            )
            file_paths = list(
                tqdm(
                    pool.imap(
                        download_func,
                        (
                            [row for _, row in tiles.iterrows()]
                            if isinstance(tiles, pd.DataFrame)
                            else tiles
                        ),
                    ),
                    total=len(tiles),
                    desc=f"Downloading {data_type} data",
                )
            )

        return [path for path in file_paths if path is not None]

    def download_by_country(
        self,
        country: str,
        data_type: Literal["polygons", "points"] = "polygons",
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
    ) -> List[str]:
        """
        Download Google Open Buildings data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country: The country code (e.g., 'USA', 'GBR') or name.
            data_type: The type of building data to download ('polygons' or 'points').
                       Defaults to 'polygons'.
            data_store: Optional instance of a `DataStore` to be used by
                        `AdminBoundaries` for loading country boundaries. If None,
                        `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                               country boundary. If provided, this boundary is used
                               instead of the default from `AdminBoundaries`.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        # Thin wrapper: boundary resolution and tile selection happen in
        # the base-class download pipeline.
        return self.download(
            source=country,
            data_type=data_type,
            data_store=data_store,
            path=country_geom_path,
        )
__init__(config=None, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Optional[GoogleOpenBuildingsConfig]

Optional configuration for file paths and download settings. If None, a default GoogleOpenBuildingsConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If None, a LocalDataStore is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/google_open_buildings.py
def __init__(
    self,
    config: Optional[GoogleOpenBuildingsConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: File-path/download configuration; a default
            `GoogleOpenBuildingsConfig` is created when omitted.
        data_store: Storage backend for downloaded files; the base class
            falls back to a `LocalDataStore` when omitted.
        logger: Custom logger; the base class creates a module-named
            logger when omitted.
    """
    if not config:
        config = GoogleOpenBuildingsConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
download_by_country(country, data_type='polygons', data_store=None, country_geom_path=None)

Download Google Open Buildings data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

Name Type Description Default
country str

The country code (e.g., 'USA', 'GBR') or name.

required
data_type Literal['polygons', 'points']

The type of building data to download ('polygons' or 'points'). Defaults to 'polygons'.

'polygons'
data_store Optional[DataStore]

Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading.

None
country_geom_path Optional[Union[str, Path]]

Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries.

None

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles for the specified country.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_by_country(
    self,
    country: str,
    data_type: Literal["polygons", "points"] = "polygons",
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """
    Download Google Open Buildings data for an entire country.

    Convenience wrapper around `download` that takes a country code or
    name as the data source.

    Args:
        country: The country code (e.g., 'USA', 'GBR') or name.
        data_type: The type of building data to download ('polygons' or
                   'points'). Defaults to 'polygons'.
        data_store: Optional `DataStore` used by `AdminBoundaries` when
                    loading the country boundary. Its default loading is
                    used when None.
        country_geom_path: Optional path to a GeoJSON file with the country
                           boundary, used instead of the `AdminBoundaries`
                           default when provided.

    Returns:
        A list of local file paths for the successfully downloaded tiles
        for the specified country.
    """
    return self.download(
        source=country,
        data_type=data_type,
        data_store=data_store,
        path=country_geom_path,
    )
download_data_unit(tile_info, data_type='polygons')

Download data file for a single tile.

The type of building data to download ('polygons' or 'points').

Defaults to 'polygons'.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_data_unit(
    self,
    tile_info: Union[pd.Series, dict],
    data_type: Literal["polygons", "points"] = "polygons",
    timeout: int = 60,
) -> Optional[str]:
    """
    Download the data file for a single tile.

    Args:
        tile_info: Tile metadata containing at least ``tile_url`` and
                   ``tile_id``.
        data_type: The type of building data to download ('polygons' or
                   'points'). Defaults to 'polygons'.
        timeout: Seconds to wait for the server before aborting the
                 request. Defaults to 60.

    Returns:
        The local file path of the downloaded tile, or None on failure.
    """

    tile_url = tile_info["tile_url"]
    if data_type == "points":
        tile_url = tile_url.replace("polygons", "points")

    try:
        # Use a context manager so the HTTP connection is always released,
        # and a timeout so a stalled server cannot hang the worker forever.
        with requests.get(tile_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            file_path = str(
                self.config.get_data_unit_path(
                    tile_info["tile_id"], data_type=data_type
                )
            )

            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['tile_id']}"
            )
            return file_path

    except requests.exceptions.RequestException as e:
        self.logger.error(
            f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
        )
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
download_data_units(tiles, data_type='polygons')

Download data files for multiple tiles.

The type of building data to download ('polygons' or 'points').

Defaults to 'polygons'.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_data_units(
    self,
    tiles: Union[pd.DataFrame, List[dict]],
    data_type: Literal["polygons", "points"] = "polygons",
) -> List[str]:
    """
    Download data files for multiple tiles in parallel.

    Args:
        tiles: Tile metadata as a DataFrame or a list of dicts; each entry
               must contain ``tile_url`` and ``tile_id``.
        data_type: The type of building data to download ('polygons' or
                   'points'). Defaults to 'polygons'.

    Returns:
        Local file paths of the successfully downloaded tiles; failed
        downloads are dropped.
    """

    if len(tiles) == 0:
        # Plain string literal: nothing to interpolate (fixes F541).
        self.logger.warning("There is no matching data")
        return []

    # Normalize to a flat iterable of per-tile records before fanning out.
    tile_records = (
        [row for _, row in tiles.iterrows()]
        if isinstance(tiles, pd.DataFrame)
        else tiles
    )

    with multiprocessing.Pool(self.config.n_workers) as pool:
        download_func = functools.partial(
            self.download_data_unit, data_type=data_type
        )
        file_paths = list(
            tqdm(
                pool.imap(download_func, tile_records),
                total=len(tiles),
                desc=f"Downloading {data_type} data",
            )
        )

    return [path for path in file_paths if path is not None]

GoogleOpenBuildingsHandler

Bases: BaseHandler

Handler for Google Open Buildings dataset.

This class provides a unified interface for downloading and loading Google Open Buildings data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsHandler(BaseHandler):
    """
    Unified interface to the Google Open Buildings dataset.

    Wires together the configuration, downloader, and reader components
    managed by `BaseHandler`, and exposes convenience loaders for point
    and polygon building data.
    """

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GoogleOpenBuildingsConfig:
        """
        Build the configuration component for this handler.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GoogleOpenBuildingsConfig instance
        """
        return GoogleOpenBuildingsConfig(
            data_store=data_store, logger=logger, **kwargs
        )

    def create_downloader(
        self,
        config: GoogleOpenBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GoogleOpenBuildingsDownloader:
        """
        Build the downloader component for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GoogleOpenBuildingsDownloader instance
        """
        return GoogleOpenBuildingsDownloader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_reader(
        self,
        config: GoogleOpenBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GoogleOpenBuildingsReader:
        """
        Build the reader component for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GoogleOpenBuildingsReader instance
        """
        return GoogleOpenBuildingsReader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def load_points(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load building points for the given source.

        Args:
            source: The data source specification
            crop_to_source: If True, crop results to the source geometry
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing building point data
        """
        return self.load_data(
            source=source,
            crop_to_source=crop_to_source,
            ensure_available=ensure_available,
            data_type="points",
            **kwargs,
        )

    def load_polygons(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        crop_to_source: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load building polygons for the given source.

        Args:
            source: The data source specification
            crop_to_source: If True, crop results to the source geometry
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing building polygon data
        """
        return self.load_data(
            source=source,
            crop_to_source=crop_to_source,
            ensure_available=ensure_available,
            data_type="polygons",
            **kwargs,
        )
create_config(data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
GoogleOpenBuildingsConfig

Configured GoogleOpenBuildingsConfig instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> GoogleOpenBuildingsConfig:
    """
    Create and return a GoogleOpenBuildingsConfig instance.

    Factory hook invoked by the base handler when no config is supplied.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters forwarded to the config

    Returns:
        Configured GoogleOpenBuildingsConfig instance
    """
    return GoogleOpenBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
create_downloader(config, data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsDownloader instance.

Parameters:

Name Type Description Default
config GoogleOpenBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
GoogleOpenBuildingsDownloader

Configured GoogleOpenBuildingsDownloader instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_downloader(
    self,
    config: GoogleOpenBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GoogleOpenBuildingsDownloader:
    """
    Create and return a GoogleOpenBuildingsDownloader instance.

    Factory hook invoked by the base handler when no downloader is supplied.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters forwarded to the downloader

    Returns:
        Configured GoogleOpenBuildingsDownloader instance
    """
    return GoogleOpenBuildingsDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsReader instance.

Parameters:

Name Type Description Default
config GoogleOpenBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
GoogleOpenBuildingsReader

Configured GoogleOpenBuildingsReader instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_reader(
    self,
    config: GoogleOpenBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GoogleOpenBuildingsReader:
    """
    Create and return a GoogleOpenBuildingsReader instance.

    Factory hook invoked by the base handler when no reader is supplied.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters forwarded to the reader

    Returns:
        Configured GoogleOpenBuildingsReader instance
    """
    return GoogleOpenBuildingsReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
load_points(source, crop_to_source=False, ensure_available=True, **kwargs)

Load point data from Google Open Buildings dataset.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing building point data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_points(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load point data from Google Open Buildings dataset.

    Args:
        source: The data source specification
        crop_to_source: If True, crop the loaded data to the source geometry
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing building point data
    """
    return self.load_data(
        source=source,
        crop_to_source=crop_to_source,
        ensure_available=ensure_available,
        data_type="points",
        **kwargs,
    )
load_polygons(source, crop_to_source=False, ensure_available=True, **kwargs)

Load polygon data from Google Open Buildings dataset.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing building polygon data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_polygons(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    crop_to_source: bool = False,
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load polygon data from Google Open Buildings dataset.

    Args:
        source: The data source specification
        crop_to_source: If True, crop the loaded data to the source geometry
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing building polygon data
    """
    return self.load_data(
        source=source,
        crop_to_source=crop_to_source,
        ensure_available=ensure_available,
        data_type="polygons",
        **kwargs,
    )

GoogleOpenBuildingsReader

Bases: BaseHandlerReader

Reader for Google Open Buildings data, supporting country, points, and geometry-based resolution.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsReader(BaseHandlerReader):
    """
    Reader for Google Open Buildings data, supporting country, points, and geometry-based resolution.
    """

    def __init__(
        self,
        config: Optional[GoogleOpenBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Fall back to a default config when none is supplied.
        if config is None:
            config = GoogleOpenBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Load building records from the given files.

        Args:
            source_data_path: List of file paths to load

        Returns:
            GeoDataFrame containing building data
        """
        return self._load_tabular_data(file_paths=source_data_path)

    def load(
        self, source, crop_to_source: bool = False, data_type="polygons", **kwargs
    ):
        """Resolve `source` and load the requested building data type."""
        return super().load(
            source=source,
            crop_to_source=crop_to_source,
            data_type=data_type,
            **kwargs,
        )

    def load_points(self, source, crop_to_source: bool = False, **kwargs):
        """This is a convenience method to load points data"""
        return self.load(
            source=source,
            crop_to_source=crop_to_source,
            data_type="points",
            **kwargs,
        )

    def load_polygons(self, source, crop_to_source: bool = False, **kwargs):
        """This is a convenience method to load polygons data"""
        return self.load(
            source=source,
            crop_to_source=crop_to_source,
            data_type="polygons",
            **kwargs,
        )
load_from_paths(source_data_path, **kwargs)

Load building data from the Google Open Buildings dataset, given a list of file paths to load; returns a GeoDataFrame containing the building data.

Source code in gigaspatial/handlers/google_open_buildings.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Load building data from Google Open Buildings dataset.

    Args:
        source_data_path: List of file paths to load
        **kwargs: Unused; accepted for interface compatibility

    Returns:
        GeoDataFrame containing building data
    """
    result = self._load_tabular_data(file_paths=source_data_path)
    return result
load_points(source, crop_to_source=False, **kwargs)

This is a convenience method to load points data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_points(self, source, crop_to_source: bool = False, **kwargs):
    """This is a convenience method to load points data"""
    # Delegates to `load` with data_type fixed to "points".
    return self.load(
        source=source, crop_to_source=crop_to_source, data_type="points", **kwargs
    )
load_polygons(source, crop_to_source=False, **kwargs)

This is a convenience method to load polygons data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_polygons(self, source, crop_to_source: bool = False, **kwargs):
    """This is a convenience method to load polygons data"""
    # Delegates to `load` with data_type fixed to "polygons".
    return self.load(
        source=source, crop_to_source=crop_to_source, data_type="polygons", **kwargs
    )

hdx

HDXConfig dataclass

Bases: BaseHandlerConfig

Configuration for HDX data access

Source code in gigaspatial/handlers/hdx.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class HDXConfig(BaseHandlerConfig):
    """Configuration for HDX (Humanitarian Data Exchange) data access."""

    # User configuration
    dataset_name: str = Field(
        default=..., description="Name of the HDX dataset to download"
    )

    # Optional configuration with defaults
    base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
    user_agent: str = Field(
        default="gigaspatial", description="User agent for HDX API requests"
    )
    hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")

    # Internal state
    _hdx_configured: bool = Field(default=False, init=False)
    dataset: Optional[Dataset] = Field(default=None, init=False)

    @staticmethod
    def search_datasets(
        query: str,
        rows: Optional[int] = None,
        sort: str = "relevance asc, metadata_modified desc",
        hdx_site: str = "prod",
        user_agent: str = "gigaspatial",
    ) -> List[Dict]:
        """Search for datasets in HDX before initializing the class.

        Args:
            query: Search query string
            rows: Number of results per page. Defaults to all datasets (sys.maxsize).
            sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
            hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
            user_agent: User agent for HDX API requests (default: 'gigaspatial')

        Returns:
            List of dataset dictionaries containing search results

        Example:
            >>> results = HDXConfig.search_datasets("population", rows=5)
            >>> for dataset in results:
            >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
        """
        # Best-effort configuration: Configuration.create raises when the
        # HDX API is already configured, which is safe to ignore here.
        try:
            Configuration.create(
                hdx_site=hdx_site,
                user_agent=user_agent,
                hdx_read_only=True,
            )
        except Exception:
            pass

        try:
            results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)

            return results
        except Exception as e:
            logging.error(f"Error searching HDX datasets: {str(e)}")
            raise

    def __post_init__(self):
        super().__post_init__()
        # Detect an already-configured HDX API so configure_hdx() can skip it.
        try:
            Configuration.read()
            self._hdx_configured = True
        except Exception:
            self._hdx_configured = False
        self.configure_hdx()
        # Fetch the dataset eagerly so construction fails fast on a bad name.
        self.dataset = self.fetch_dataset()

    @property
    def output_dir_path(self) -> Path:
        """Path to save the downloaded HDX dataset"""
        return self.base_path / self.dataset_name

    def configure_hdx(self):
        """Configure HDX API if not already configured"""
        if not self._hdx_configured:
            try:
                Configuration.create(
                    hdx_site=self.hdx_site,
                    user_agent=self.user_agent,
                    hdx_read_only=True,
                )
                self._hdx_configured = True
            except Exception as e:
                self.logger.error(f"Error configuring HDX API: {str(e)}")
                raise

    def fetch_dataset(self) -> Dataset:
        """Get the HDX dataset, raising ValueError when it does not exist."""
        try:
            self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
            dataset = Dataset.read_from_hdx(self.dataset_name)
            if not dataset:
                raise ValueError(
                    f"Dataset '{self.dataset_name}' not found on HDX. "
                    "Please verify the dataset name or use search_datasets() "
                    "to find available datasets."
                )
            return dataset
        except Exception as e:
            self.logger.error(f"Error fetching HDX dataset: {str(e)}")
            raise

    def _match_pattern(self, value: str, pattern: str) -> bool:
        """Check if a value matches a pattern (case-insensitive substring)."""
        if isinstance(pattern, str):
            return pattern.lower() in value.lower()
        return value == pattern

    def _get_patterns_for_value(self, value: Any) -> List[str]:
        """Generate patterns for a given value or list of values"""
        if isinstance(value, list):
            patterns = []
            for v in value:
                patterns.extend(self._get_patterns_for_value(v))
            return patterns

        if not isinstance(value, str):
            return [value]

        patterns = []
        value = value.lower()

        # Add exact match
        patterns.append(value)

        # Add common variations
        patterns.extend(
            [
                f"/{value}_",  # URL path with prefix
                f"/{value}.",  # URL path with extension
                f"_{value}_",  # Filename with value in middle
                f"_{value}.",  # Filename with value at end
            ]
        )

        # If value contains spaces, generate additional patterns
        if " " in value:
            # Generate patterns for space-less version
            no_space = value.replace(" ", "")
            patterns.extend(self._get_patterns_for_value(no_space))

            # Generate patterns for hyphenated version
            hyphenated = value.replace(" ", "-")
            patterns.extend(self._get_patterns_for_value(hyphenated))

        return patterns

    def get_dataset_resources(
        self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
    ) -> List[Resource]:
        """Get resources from the HDX dataset

        Args:
            filter: Dictionary of key-value pairs to filter resources
            exact_match: If True, perform exact matching. If False, use pattern matching

        Returns:
            The (optionally filtered) list of resources of the dataset.
        """
        try:
            resources = self.dataset.get_resources()

            # Apply resource filter if specified
            if filter:
                filtered_resources = []
                for res in resources:
                    match = True
                    for key, value in filter.items():
                        if key not in res.data:
                            match = False
                            break

                        if exact_match:
                            # For exact matching, check if value matches or is in list of values
                            if isinstance(value, list):
                                if res.data[key] not in value:
                                    match = False
                                    break
                            elif res.data[key] != value:
                                match = False
                                break
                        else:
                            # For pattern matching, generate patterns for value(s)
                            patterns = self._get_patterns_for_value(value)
                            if not any(
                                self._match_pattern(str(res.data[key]), pattern)
                                for pattern in patterns
                            ):
                                match = False
                                break

                    if match:
                        filtered_resources.append(res)
                resources = filtered_resources

            return resources
        except Exception as e:
            self.logger.error(f"Error getting dataset resources: {str(e)}")
            raise

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[Resource]:
        # NOTE(review): `geometry` is forwarded as the `filter` positional
        # argument of get_dataset_resources, which expects a dict — TODO
        # confirm callers only reach this via extract_search_geometry().
        return self.get_dataset_resources(geometry, **kwargs)

    def get_data_unit_path(self, unit: Union[Resource, dict], **kwargs) -> Path:
        """Get the local path for a data unit (HDX resource or dict)."""
        try:
            # HDX Resource objects expose their metadata via `.data`.
            filename = unit.data["name"]
        except Exception:
            # Dict-like fallback: derive the filename from the download URL.
            filename = unit.get("download_url").split("/")[-1]

        return self.output_dir_path / filename

    def list_resources(self) -> List[str]:
        """List all resources in the dataset directory using the data_store."""
        dataset_folder = str(self.output_dir_path)
        # Check if the dataset directory exists in the data_store
        if not (
            self.data_store.is_dir(dataset_folder)
            or self.data_store.file_exists(dataset_folder)
        ):
            raise FileNotFoundError(
                f"HDX dataset not found at {dataset_folder}. "
                "Download the data first using HDXDownloader."
            )
        return self.data_store.list_files(dataset_folder)

    def extract_search_geometry(self, source, **kwargs):
        """
        Override the base class method since geometry extraction does not apply.
        Returns dictionary to filter.

        Args:
            source: Either a country name/code (str) or a filter dictionary
            **kwargs: Additional keyword arguments passed to the specific method

        Raises:
            ValueError: If `source` is neither a string nor a dictionary.
        """
        if isinstance(source, str):
            country = pycountry.countries.lookup(source)
            values = [country.alpha_3, country.alpha_2, country.name]
            key = kwargs.get(
                "key", "url"
            )  # The key to filter on in the resource data. Defaults to `url`
            return {key: values}
        elif isinstance(source, dict):
            return source
        else:
            # Separator added so the two literals no longer run together.
            raise ValueError(
                f"Unsupported source type: {type(source)}. "
                "Please use country-based (str) filtering or direct resource (dict) filtering instead."
            )

    def __repr__(self) -> str:
        return (
            f"HDXConfig(\n"
            f"  dataset_name='{self.dataset_name}'\n"
            f"  base_path='{self.base_path}'\n"
            f"  hdx_site='{self.hdx_site}'\n"
            f"  user_agent='{self.user_agent}'\n"
            f")"
        )
output_dir_path: Path property

Path to save the downloaded HDX dataset

configure_hdx()

Configure HDX API if not already configured

Source code in gigaspatial/handlers/hdx.py
def configure_hdx(self):
    """Configure HDX API if not already configured"""
    if not self._hdx_configured:
        try:
            # Read-only configuration is sufficient for downloading datasets.
            Configuration.create(
                hdx_site=self.hdx_site,
                user_agent=self.user_agent,
                hdx_read_only=True,
            )
            self._hdx_configured = True
        except Exception as e:
            self.logger.error(f"Error configuring HDX API: {str(e)}")
            raise
extract_search_geometry(source, **kwargs)

Override the base class method since geometry extraction does not apply. Returns dictionary to filter.

Parameters:

Name Type Description Default
source

Either a country name/code (str) or a filter dictionary

required
**kwargs

Additional keyword arguments passed to the specific method

{}
Source code in gigaspatial/handlers/hdx.py
def extract_search_geometry(self, source, **kwargs):
    """
    Override the base class method since geometry extraction does not apply.
    Returns dictionary to filter.

    Args:
        source: Either a country name/code (str) or a filter dictionary
        **kwargs: Additional keyword arguments passed to the specific method

    Raises:
        ValueError: If `source` is neither a string nor a dictionary.
    """
    if isinstance(source, str):
        country = pycountry.countries.lookup(source)
        values = [country.alpha_3, country.alpha_2, country.name]
        key = kwargs.get(
            "key", "url"
        )  # The key to filter on in the resource data. Defaults to `url`
        return {key: values}
    elif isinstance(source, dict):
        return source
    else:
        # NOTE(review): the two adjacent literals concatenate with no
        # separator, producing "...<type>Please use..." — consider fixing.
        raise ValueError(
            f"Unsupported source type: {type(source)}"
            "Please use country-based (str) filtering or direct resource (dict) filtering instead."
        )
fetch_dataset()

Get the HDX dataset

Source code in gigaspatial/handlers/hdx.py
def fetch_dataset(self) -> Dataset:
    """Get the HDX dataset.

    Raises:
        ValueError: If the dataset name is not found on HDX.
    """
    try:
        self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
        dataset = Dataset.read_from_hdx(self.dataset_name)
        # read_from_hdx returns a falsy value when the dataset is missing.
        if not dataset:
            raise ValueError(
                f"Dataset '{self.dataset_name}' not found on HDX. "
                "Please verify the dataset name or use search_datasets() "
                "to find available datasets."
            )
        return dataset
    except Exception as e:
        self.logger.error(f"Error fetching HDX dataset: {str(e)}")
        raise
get_data_unit_path(unit, **kwargs)

Get the path for a data unit

Source code in gigaspatial/handlers/hdx.py
def get_data_unit_path(self, unit: str, **kwargs) -> str:
    """Get the path for a data unit"""
    # NOTE(review): despite the hints, `unit` is handled as an HDX Resource
    # or a dict, and the return value is a Path — TODO confirm and fix.
    try:
        filename = unit.data["name"]
    except:  # NOTE(review): bare except — should be narrowed to Exception
        filename = unit.get("download_url").split("/")[-1]

    return self.output_dir_path / filename
get_dataset_resources(filter=None, exact_match=False)

Get resources from the HDX dataset

Parameters:

Name Type Description Default
filter Optional[Dict[str, Any]]

Dictionary of key-value pairs to filter resources

None
exact_match bool

If True, perform exact matching. If False, use pattern matching

False
Source code in gigaspatial/handlers/hdx.py
def get_dataset_resources(
    self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
) -> List[Resource]:
    """Get resources from the HDX dataset

    Args:
        filter: Dictionary of key-value pairs to filter resources
        exact_match: If True, perform exact matching. If False, use pattern matching
    """

    def _matches(res) -> bool:
        # A resource matches only if every filter key is present and agrees.
        for key, wanted in filter.items():
            if key not in res.data:
                return False
            actual = res.data[key]
            if exact_match:
                # Exact mode: value must equal the filter value, or be a
                # member when a list of candidates was supplied.
                candidates = wanted if isinstance(wanted, list) else [wanted]
                if actual not in candidates:
                    return False
            else:
                # Pattern mode: any generated pattern may match.
                patterns = self._get_patterns_for_value(wanted)
                if not any(
                    self._match_pattern(str(actual), pattern)
                    for pattern in patterns
                ):
                    return False
        return True

    try:
        resources = self.dataset.get_resources()
        if filter:
            resources = [res for res in resources if _matches(res)]
        return resources
    except Exception as e:
        self.logger.error(f"Error getting dataset resources: {str(e)}")
        raise
list_resources()

List all resources in the dataset directory using the data_store.

Source code in gigaspatial/handlers/hdx.py
def list_resources(self) -> List[str]:
    """List all resources in the dataset directory using the data_store."""
    dataset_folder = str(self.output_dir_path)
    # The folder may surface as a directory or a single file depending on
    # the backing DataStore implementation.
    exists = self.data_store.is_dir(dataset_folder) or self.data_store.file_exists(
        dataset_folder
    )
    if not exists:
        raise FileNotFoundError(
            f"HDX dataset not found at {dataset_folder}. "
            "Download the data first using HDXDownloader."
        )
    return self.data_store.list_files(dataset_folder)
search_datasets(query, rows=None, sort='relevance asc, metadata_modified desc', hdx_site='prod', user_agent='gigaspatial') staticmethod

Search for datasets in HDX before initializing the class.

Parameters:

Name Type Description Default
query str

Search query string

required
rows int

Number of results per page. Defaults to all datasets (sys.maxsize).

None
sort str

Sort order as a CKAN sort expression, e.g. 'relevance asc, metadata_modified desc' (the default)

'relevance asc, metadata_modified desc'
hdx_site str

HDX site to use - 'prod' or 'test' (default: 'prod')

'prod'
user_agent str

User agent for HDX API requests (default: 'gigaspatial')

'gigaspatial'

Returns:

Type Description
List[Dict]

List of dataset dictionaries containing search results

Example

results = HDXConfig.search_datasets("population", rows=5) for dataset in results: print(f"Name: {dataset['name']}, Title: {dataset['title']}")

Source code in gigaspatial/handlers/hdx.py
@staticmethod
def search_datasets(
    query: str,
    rows: Optional[int] = None,
    sort: str = "relevance asc, metadata_modified desc",
    hdx_site: str = "prod",
    user_agent: str = "gigaspatial",
) -> List[Dict]:
    """Search for datasets in HDX before initializing the class.

    Args:
        query: Search query string
        rows: Number of results per page. Defaults to all datasets (sys.maxsize).
        sort: CKAN sort expression (default: 'relevance asc, metadata_modified desc')
        hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
        user_agent: User agent for HDX API requests (default: 'gigaspatial')

    Returns:
        List of dataset dictionaries containing search results

    Example:
        >>> results = HDXConfig.search_datasets("population", rows=5)
        >>> for dataset in results:
        >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
    """
    try:
        Configuration.create(
            hdx_site=hdx_site,
            user_agent=user_agent,
            hdx_read_only=True,
        )
    except Exception:
        # Narrowed from a bare `except:`. Configuration.create raises when a
        # configuration already exists; that is fine here — reuse it.
        pass

    try:
        results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)

        return results
    except Exception as e:
        logging.error(f"Error searching HDX datasets: {str(e)}")
        raise

HDXDownloader

Bases: BaseHandlerDownloader

Downloader for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXDownloader(BaseHandlerDownloader):
    """Downloader for HDX datasets.

    Stages each HDX resource in a temporary directory, then writes the
    bytes into the configured DataStore at the config-resolved path.
    """

    def __init__(
        self,
        config: Union[HDXConfig, dict],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Accept a ready HDXConfig or a dict of HDXConfig keyword arguments."""
        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(self, resource, **kwargs) -> Optional[str]:
        """Download a single resource.

        Args:
            resource: HDX Resource object to download.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            The target path written in the DataStore, or None on failure.
        """
        # Bug fix: resource_name must be bound before the try block —
        # previously, if resource.get() itself raised, the except handler
        # hit a NameError that masked the original error.
        resource_name = "Unknown"
        try:
            resource_name = resource.get("name", "Unknown")
            self.logger.info(f"Downloading resource: {resource_name}")

            with tempfile.TemporaryDirectory() as tmpdir:
                url, local_path = resource.download(folder=tmpdir)
                with open(local_path, "rb") as f:
                    data = f.read()
                # Compose the target path in the DataStore
                target_path = str(self.config.get_data_unit_path(resource))
                self.data_store.write_file(target_path, data)
                self.logger.info(
                    f"Downloaded resource: {resource_name} to {target_path}"
                )
                return target_path
        except Exception as e:
            self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
            return None

    def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
        """Download multiple resources sequentially

        Args:
            resources: List of HDX Resource objects
            **kwargs: Additional keyword arguments

        Returns:
            List of paths to downloaded files
        """
        if len(resources) == 0:
            self.logger.warning("There is no resource to download")
            return []

        downloaded_paths = []
        for resource in tqdm(resources, desc="Downloading resources"):
            path = self.download_data_unit(resource)
            if path:
                downloaded_paths.append(path)

        return downloaded_paths
download_data_unit(resource, **kwargs)

Download a single resource

Source code in gigaspatial/handlers/hdx.py
def download_data_unit(self, resource, **kwargs) -> Optional[str]:
    """Download a single resource into the DataStore.

    Args:
        resource: HDX Resource object to download.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        The target path written in the DataStore, or None on failure.
    """
    # Bug fix: resource_name must be bound before the try block —
    # previously, if resource.get() itself raised, the except handler
    # hit a NameError that masked the original error.
    resource_name = "Unknown"
    try:
        resource_name = resource.get("name", "Unknown")
        self.logger.info(f"Downloading resource: {resource_name}")

        with tempfile.TemporaryDirectory() as tmpdir:
            url, local_path = resource.download(folder=tmpdir)
            with open(local_path, "rb") as f:
                data = f.read()
            # Compose the target path in the DataStore
            target_path = str(self.config.get_data_unit_path(resource))
            self.data_store.write_file(target_path, data)
            self.logger.info(
                f"Downloaded resource: {resource_name} to {target_path}"
            )
            return target_path
    except Exception as e:
        self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
        return None
download_data_units(resources, **kwargs)

Download multiple resources sequentially

Parameters:

Name Type Description Default
resources List[Resource]

List of HDX Resource objects

required
**kwargs

Additional keyword arguments

{}

Returns:

Type Description
List[str]

List of paths to downloaded files

Source code in gigaspatial/handlers/hdx.py
def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
    """Download multiple resources sequentially

    Args:
        resources: List of HDX Resource objects
        **kwargs: Additional keyword arguments

    Returns:
        List of paths to downloaded files
    """
    if not resources:
        self.logger.warning("There is no resource to download")
        return []

    # Keep only successful downloads (failed units return a falsy path).
    return [
        path
        for path in (
            self.download_data_unit(resource)
            for resource in tqdm(resources, desc="Downloading resources")
        )
        if path
    ]

HDXHandler

Bases: BaseHandler

Handler for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXHandler(BaseHandler):
    """Handler for HDX datasets"""

    def __init__(
        self,
        dataset_name: str,
        config: Optional[HDXConfig] = None,
        downloader: Optional[HDXDownloader] = None,
        reader: Optional[HDXReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """Remember the dataset name, then let BaseHandler wire components."""
        # The name must be stored before super().__init__, which may call
        # create_config() and read it.
        self._dataset_name = dataset_name
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> HDXConfig:
        """Create and return a HDXConfig instance"""
        return HDXConfig(
            dataset_name=self._dataset_name,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self, config: HDXConfig, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> HDXDownloader:
        """Create and return a HDXDownloader instance"""
        return HDXDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self, config: HDXConfig, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> HDXReader:
        """Create and return a HDXReader instance"""
        return HDXReader(config=config, data_store=data_store, logger=logger, **kwargs)
create_config(data_store, logger, **kwargs)

Create and return a HDXConfig instance

Source code in gigaspatial/handlers/hdx.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> HDXConfig:
    """Build the HDXConfig for this handler's dataset name."""
    return HDXConfig(
        dataset_name=self._dataset_name, data_store=data_store, logger=logger, **kwargs
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a HDXDownloader instance

Source code in gigaspatial/handlers/hdx.py
def create_downloader(
    self, config: HDXConfig, data_store: DataStore, logger: logging.Logger, **kwargs
) -> HDXDownloader:
    """Build the HDXDownloader bound to the given config and store."""
    return HDXDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a HDXReader instance

Source code in gigaspatial/handlers/hdx.py
def create_reader(
    self, config: HDXConfig, data_store: DataStore, logger: logging.Logger, **kwargs
) -> HDXReader:
    """Build the HDXReader bound to the given config and store."""
    return HDXReader(config=config, data_store=data_store, logger=logger, **kwargs)

HDXReader

Bases: BaseHandlerReader

Reader for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXReader(BaseHandlerReader):
    """Reader for HDX datasets"""

    def __init__(
        self,
        config: Optional[HDXConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize the reader.

        Args:
            config: An HDXConfig instance, a dict of HDXConfig keyword
                arguments, or None.
            data_store: Optional DataStore; passed through to the base reader.
            logger: Optional logger; passed through to the base reader.
        """
        # Bug fix: the previous code unconditionally coerced non-HDXConfig
        # values via HDXConfig(**config), which raised a TypeError when
        # config was the declared default None.
        if config is not None and not isinstance(config, HDXConfig):
            config = HDXConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        """Load data from paths.

        A single path returns the dataset directly; multiple paths return a
        dict keyed by path.

        Raises:
            ValueError: If any of several files cannot be read.
        """
        if len(source_data_path) == 1:
            return read_dataset(self.data_store, source_data_path[0])

        all_data = {}
        for file_path in source_data_path:
            try:
                all_data[file_path] = read_dataset(self.data_store, file_path)
            except Exception as e:
                raise ValueError(f"Could not read file {file_path}: {str(e)}")
        return all_data

    def load_all_resources(self):
        """Read every resource listed by the config into memory."""
        resources = self.config.list_resources()
        return self.load_from_paths(resources)
load_from_paths(source_data_path, **kwargs)

Load data from paths

Source code in gigaspatial/handlers/hdx.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Any:
    """Load data from paths"""
    # One path: hand back the dataset itself, not a one-entry mapping.
    if len(source_data_path) == 1:
        return read_dataset(self.data_store, source_data_path[0])

    # Several paths: collect {path: dataset}, failing fast on any error.
    collected = {}
    for file_path in source_data_path:
        try:
            collected[file_path] = read_dataset(self.data_store, file_path)
        except Exception as e:
            raise ValueError(f"Could not read file {file_path}: {str(e)}")
    return collected

healthsites

HealthSitesFetcher

Fetch and process health facility location data from the Healthsites.io API.

Source code in gigaspatial/handlers/healthsites.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class HealthSitesFetcher:
    """
    Fetch and process health facility location data from the Healthsites.io API.
    """

    country: Optional[str] = Field(default=None, description="Country to filter")
    api_url: str = Field(
        default="https://healthsites.io/api/v3/facilities/",
        description="Base URL for the Healthsites API",
    )
    # API key read from the package-level config at class-definition time.
    api_key: str = config.HEALTHSITES_API_KEY
    extent: Optional[Tuple[float, float, float, float]] = Field(
        default=None, description="Bounding box as (minLng, minLat, maxLng, maxLat)"
    )
    page_size: int = Field(default=100, description="Number of records per API page")
    flat_properties: bool = Field(
        default=True, description="Show properties in flat format"
    )
    tag_format: str = Field(default="osm", description="Tag format (osm/hxl)")
    output_format: str = Field(
        default="geojson", description="Output format (json/geojson)"
    )
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )

    def __post_init__(self):
        # Runs after pydantic dataclass validation; sets up instance logger.
        self.logger = config.get_logger(self.__class__.__name__)
        # Convert country code to OSM English name if provided
        if self.country:
            self.country = self._convert_country(self.country)

    def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Fetch and process health facility locations.

        Args:
            **kwargs: Additional parameters for customization
                - country: Override country filter
                - extent: Override extent filter
                - from_date: Get data modified from this timestamp (datetime or string)
                - to_date: Get data modified to this timestamp (datetime or string)
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - output_format: Override output format ('json' or 'geojson')
                - flat_properties: Override flat properties setting

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
                Returns GeoDataFrame for geojson format, DataFrame for json format.
        """
        # Override defaults with kwargs if provided
        country = kwargs.get("country", self.country)
        extent = kwargs.get("extent", self.extent)
        from_date = kwargs.get("from_date", None)
        to_date = kwargs.get("to_date", None)
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        output_format = kwargs.get("output_format", self.output_format)
        flat_properties = kwargs.get("flat_properties", self.flat_properties)

        # Convert country if provided in kwargs
        if country:
            country = self._convert_country(country)

        # Prepare base parameters
        base_params = {
            "api-key": self.api_key,
            "tag-format": self.tag_format,
            "output": output_format,
        }

        # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
        if flat_properties:
            base_params["flat-properties"] = "true"

        # Add optional filters
        if country:
            base_params["country"] = country

        if extent:
            if len(extent) != 4:
                raise ValueError(
                    "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
                )
            base_params["extent"] = ",".join(map(str, extent))

        # NOTE(review): _format_timestamp is referenced below but is not defined
        # in this class as shown — confirm it exists on the class or a mixin.
        if from_date:
            base_params["from"] = self._format_timestamp(from_date)

        if to_date:
            base_params["to"] = self._format_timestamp(to_date)

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch health facilities for country: {country or 'all countries'}"
        )
        self.logger.info(
            f"Output format: {output_format}, Flat properties: {flat_properties}"
        )

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Add page parameter
            params = base_params.copy()
            params["page"] = page

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(self.api_url, params=params)
                response.raise_for_status()

                parsed = response.json()

                # Handle different response structures based on output format
                if output_format == "geojson":
                    # GeoJSON returns FeatureCollection with features list
                    data = parsed.get("features", [])
                else:
                    # JSON returns direct list
                    data = parsed if isinstance(parsed, list) else []

            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data or not isinstance(data, list):
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame/GeoDataFrame based on format
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            if output_format == "geojson":
                return gpd.GeoDataFrame()
            return pd.DataFrame()

        if output_format == "geojson":
            # Use GeoDataFrame.from_features for GeoJSON format
            gdf = gpd.GeoDataFrame.from_features(all_data, crs="EPSG:4326")
            self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
            return gdf
        else:
            # For JSON format, handle nested structure if flat_properties is False
            if not flat_properties:
                df = self._process_json_with_centroid(all_data)
            else:
                df = pd.DataFrame(all_data)

            self.logger.info(f"Created DataFrame with {len(df)} records")
            return df

    def fetch_statistics(self, **kwargs) -> dict:
        """
        Fetch statistics for health facilities.

        Args:
            **kwargs: Same filtering parameters as fetch_facilities

        Returns:
            dict: Statistics data
        """
        country = kwargs.get("country", self.country)
        extent = kwargs.get("extent", self.extent)
        from_date = kwargs.get("from_date", None)
        to_date = kwargs.get("to_date", None)

        # Convert country if provided
        if country:
            country = self._convert_country(country)

        params = {
            "api-key": self.api_key,
        }

        # Add optional filters
        if country:
            params["country"] = country
        if extent:
            params["extent"] = ",".join(map(str, extent))
        if from_date:
            params["from"] = self._format_timestamp(from_date)
        if to_date:
            params["to"] = self._format_timestamp(to_date)

        try:
            # NOTE(review): api_url already ends with '/', so this URL contains
            # a double slash ('...facilities//statistic/') — confirm the API
            # tolerates it.
            response = requests.get(f"{self.api_url}/statistic/", params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for statistics: {e}")
            raise

    def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
        """
        Fetch a specific facility by OSM type and ID.

        Args:
            osm_type: OSM type (node, way, relation)
            osm_id: OSM ID

        Returns:
            dict: Facility details
        """
        params = {"api-key": self.api_key}

        try:
            url = f"{self.api_url}/{osm_type}/{osm_id}"
            response = requests.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
            raise

    def _create_dataframe(self, data: List[dict]) -> pd.DataFrame:
        """
        Create DataFrame from API response data.

        Args:
            data: List of facility records

        Returns:
            pd.DataFrame: Processed DataFrame
        """
        # NOTE(review): not called by any method shown in this class — possibly
        # dead code; confirm external callers before removing.
        if self.output_format == "geojson":
            # Handle GeoJSON format
            records = []
            for feature in data:
                record = feature.get("properties", {}).copy()
                geometry = feature.get("geometry", {})
                coordinates = geometry.get("coordinates", [])

                if coordinates and len(coordinates) >= 2:
                    record["longitude"] = coordinates[0]
                    record["latitude"] = coordinates[1]

                records.append(record)
            return pd.DataFrame(records)
        else:
            # Handle regular JSON format
            return pd.DataFrame(data)

    def _process_json_with_centroid(self, data: List[dict]) -> pd.DataFrame:
        """
        Process JSON data to flatten 'attributes' and 'centroid' fields,
        and extract longitude/latitude from centroid.

        Args:
            data: List of facility records, where each record might contain
                  nested 'attributes' and 'centroid' dictionaries.

        Returns:
            pd.DataFrame: Processed DataFrame with flattened data.
        """
        processed_records = []
        for record in data:
            new_record = {}

            # Flatten top-level keys
            for key, value in record.items():
                if key not in ["attributes", "centroid"]:
                    new_record[key] = value

            # Flatten 'attributes'
            attributes = record.get("attributes", {})
            for attr_key, attr_value in attributes.items():
                new_record[f"{attr_key}"] = attr_value

            # Extract centroid coordinates
            centroid = record.get("centroid", {})
            coordinates = centroid.get("coordinates", [])
            if coordinates and len(coordinates) == 2:
                new_record["longitude"] = coordinates[0]
                new_record["latitude"] = coordinates[1]
            else:
                new_record["longitude"] = None
                new_record["latitude"] = None

            processed_records.append(new_record)

        return pd.DataFrame(processed_records)

    def _convert_country(self, country: str) -> str:
        # Resolve any recognizable country identifier to the OSM English name
        # that the Healthsites API expects as its 'country' filter.
        try:
            # First convert to ISO3 format if needed
            country_obj = pycountry.countries.lookup(country)
            iso3_code = country_obj.alpha_3

            # Get OSM English name using OSMLocationFetcher
            osm_data = OSMLocationFetcher.get_osm_countries(iso3_code=iso3_code)
            osm_name_en = osm_data.get("name:en")

            if not osm_name_en:
                raise ValueError(
                    f"Could not find OSM English name for country: {country}"
                )

            self.logger.info(
                f"Converted country code to OSM English name: {osm_name_en}"
            )

            return osm_name_en

        except LookupError:
            raise ValueError(f"Invalid country code provided: {country}")
        except Exception as e:
            raise ValueError(f"Failed to get OSM English name: {e}")
fetch_facilities(**kwargs)

Fetch and process health facility locations.

Parameters:

Name Type Description Default
**kwargs

Additional parameters for customization - country: Override country filter - extent: Override extent filter - from_date: Get data modified from this timestamp (datetime or string) - to_date: Get data modified to this timestamp (datetime or string) - page_size: Override default page size - sleep_time: Override default sleep time between requests - max_pages: Limit the number of pages to fetch - output_format: Override output format ('json' or 'geojson') - flat_properties: Override flat properties setting

{}

Returns:

Type Description
Union[DataFrame, GeoDataFrame]

Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data. Returns GeoDataFrame for geojson format, DataFrame for json format.

Source code in gigaspatial/handlers/healthsites.py
def fetch_facilities(self, **kwargs) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Fetch and process health facility locations.

    Args:
        **kwargs: Additional parameters for customization
            - country: Override country filter
            - extent: Override extent filter
            - from_date: Get data modified from this timestamp (datetime or string)
            - to_date: Get data modified to this timestamp (datetime or string)
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - output_format: Override output format ('json' or 'geojson')
            - flat_properties: Override flat properties setting

    Returns:
        Union[pd.DataFrame, gpd.GeoDataFrame]: Health facilities data.
            Returns GeoDataFrame for geojson format, DataFrame for json format.
    """
    # Override defaults with kwargs if provided
    country = kwargs.get("country", self.country)
    extent = kwargs.get("extent", self.extent)
    from_date = kwargs.get("from_date", None)
    to_date = kwargs.get("to_date", None)
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    output_format = kwargs.get("output_format", self.output_format)
    flat_properties = kwargs.get("flat_properties", self.flat_properties)

    # Convert country if provided in kwargs
    if country:
        country = self._convert_country(country)

    # Prepare base parameters
    base_params = {
        "api-key": self.api_key,
        "tag-format": self.tag_format,
        "output": output_format,
    }

    # Only add flat-properties if True (don't send it as false, as that makes it flat anyway)
    if flat_properties:
        base_params["flat-properties"] = "true"

    # Add optional filters
    if country:
        base_params["country"] = country

    if extent:
        if len(extent) != 4:
            raise ValueError(
                "Extent must be a tuple of 4 values: (minLng, minLat, maxLng, maxLat)"
            )
        base_params["extent"] = ",".join(map(str, extent))

    # NOTE(review): _format_timestamp is referenced below but not visible in
    # this excerpt — confirm it is defined on the class or a mixin.
    if from_date:
        base_params["from"] = self._format_timestamp(from_date)

    if to_date:
        base_params["to"] = self._format_timestamp(to_date)

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch health facilities for country: {country or 'all countries'}"
    )
    self.logger.info(
        f"Output format: {output_format}, Flat properties: {flat_properties}"
    )

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        # Add page parameter
        params = base_params.copy()
        params["page"] = page

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(self.api_url, params=params)
            response.raise_for_status()

            parsed = response.json()

            # Handle different response structures based on output format
            if output_format == "geojson":
                # GeoJSON returns FeatureCollection with features list
                data = parsed.get("features", [])
            else:
                # JSON returns direct list
                data = parsed if isinstance(parsed, list) else []

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data or not isinstance(data, list):
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame/GeoDataFrame based on format
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        if output_format == "geojson":
            return gpd.GeoDataFrame()
        return pd.DataFrame()

    if output_format == "geojson":
        # Use GeoDataFrame.from_features for GeoJSON format
        gdf = gpd.GeoDataFrame.from_features(all_data, crs="EPSG:4326")
        self.logger.info(f"Created GeoDataFrame with {len(gdf)} records")
        return gdf
    else:
        # For JSON format, handle nested structure if flat_properties is False
        if not flat_properties:
            df = self._process_json_with_centroid(all_data)
        else:
            df = pd.DataFrame(all_data)

        self.logger.info(f"Created DataFrame with {len(df)} records")
        return df
fetch_facility_by_id(osm_type, osm_id)

Fetch a specific facility by OSM type and ID.

Parameters:

Name Type Description Default
osm_type str

OSM type (node, way, relation)

required
osm_id str

OSM ID

required

Returns:

Name Type Description
dict dict

Facility details

Source code in gigaspatial/handlers/healthsites.py
def fetch_facility_by_id(self, osm_type: str, osm_id: str) -> dict:
    """
    Retrieve the details of a single facility identified by OSM type and ID.

    Args:
        osm_type: OSM type (node, way, relation)
        osm_id: OSM ID

    Returns:
        dict: Facility details

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails
            (logged before re-raising).
    """
    request_url = f"{self.api_url}/{osm_type}/{osm_id}"
    query = {"api-key": self.api_key}

    try:
        resp = requests.get(request_url, params=query)
        resp.raise_for_status()
        return resp.json()
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Request failed for facility {osm_type}/{osm_id}: {e}")
        raise
fetch_statistics(**kwargs)

Fetch statistics for health facilities.

Parameters:

Name Type Description Default
**kwargs

Same filtering parameters as fetch_facilities

{}

Returns:

Name Type Description
dict dict

Statistics data

Source code in gigaspatial/handlers/healthsites.py
def fetch_statistics(self, **kwargs) -> dict:
    """
    Fetch aggregate statistics for health facilities.

    Args:
        **kwargs: Same filtering parameters as fetch_facilities
            (country, extent, from_date, to_date).

    Returns:
        dict: Statistics data

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails
            (logged before re-raising).
    """
    # Fall back to the instance-level filters when not overridden.
    country = kwargs.get("country", self.country)
    extent = kwargs.get("extent", self.extent)
    from_date = kwargs.get("from_date")
    to_date = kwargs.get("to_date")

    # Normalize the country identifier before sending it to the API.
    if country:
        country = self._convert_country(country)

    params = {"api-key": self.api_key}

    # Only include filters that were actually supplied.
    if country:
        params["country"] = country
    if extent:
        params["extent"] = ",".join(str(coord) for coord in extent)
    if from_date:
        params["from"] = self._format_timestamp(from_date)
    if to_date:
        params["to"] = self._format_timestamp(to_date)

    try:
        response = requests.get(f"{self.api_url}/statistic/", params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Request failed for statistics: {e}")
        raise

mapbox_image

MapboxImageDownloader

Class to download images from Mapbox Static Images API using a specific style

Source code in gigaspatial/handlers/mapbox_image.py
class MapboxImageDownloader:
    """Class to download images from Mapbox Static Images API using a specific style"""

    # Root endpoint of the Mapbox Static Images API (style-based rendering).
    BASE_URL = "https://api.mapbox.com/styles/v1"

    def __init__(
        self,
        access_token: str = config.MAPBOX_ACCESS_TOKEN,
        style_id: Optional[str] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the downloader with Mapbox credentials

        Args:
            access_token: Mapbox access token
            style_id: Mapbox style ID to use for image download; defaults to
                "mapbox/satellite-v9" when not provided
            data_store: Instance of DataStore for accessing data storage;
                defaults to a LocalDataStore when not provided
        """
        self.access_token = access_token
        self.style_id = style_id if style_id else "mapbox/satellite-v9"
        self.data_store = data_store or LocalDataStore()
        self.logger = config.get_logger(self.__class__.__name__)

    def _construct_url(self, bounds: Iterable[float], image_size: str) -> str:
        """Construct the Mapbox Static Images API URL

        Args:
            bounds: bounding box values in (west, south, east, north) order,
                rendered as a bracketed comma-separated list.
            image_size: "{width}x{height}" pixel size string.

        Returns:
            Fully-qualified request URL including the access token.
        """
        bounds_str = f"[{','.join(map(str, bounds))}]"

        return (
            f"{self.BASE_URL}/{self.style_id}/static/{bounds_str}/{image_size}"
            f"?access_token={self.access_token}&attribution=false&logo=false"
        )

    def _download_single_image(self, url: str, output_path: Path) -> bool:
        """Download a single image from URL

        Writes the response body to `output_path` through the data store.
        Returns True on success; on any failure logs a warning and returns
        False rather than raising.
        """
        try:
            # NOTE(review): no timeout is set — a stalled request can block a
            # worker thread indefinitely; consider passing timeout=.
            response = requests.get(url)
            response.raise_for_status()

            with self.data_store.open(str(output_path), "wb") as f:
                f.write(response.content)
            return True
        except Exception as e:
            self.logger.warning(f"Error downloading {output_path.name}: {str(e)}")
            return False

    def download_images_by_tiles(
        self,
        mercator_tiles: "MercatorTiles",
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given mercator tiles using the specified style

        One PNG is written per quadkey, named "{image_prefix}{quadkey}.png".

        Args:
            mercator_tiles: MercatorTiles instance containing quadkeys
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)
        # self.data_store.makedirs(str(output_dir), exist_ok=True)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_tiles = len(mercator_tiles.quadkeys)

        self.logger.info(
            f"Downloading {total_tiles} tiles with size {image_size_str}..."
        )

        def _get_tile_bounds(quadkey: str) -> List[float]:
            """Get tile bounds from quadkey"""
            tile = mercantile.quadkey_to_tile(quadkey)
            bounds = mercantile.bounds(tile)
            return [bounds.west, bounds.south, bounds.east, bounds.north]

        def download_image(quadkey: str) -> bool:
            # Fetch one tile image; failures are logged and reported as False.
            bounds = _get_tile_bounds(quadkey)
            file_name = f"{image_prefix}{quadkey}.png"

            url = self._construct_url(bounds, image_size_str)
            success = self._download_single_image(url, output_dir / file_name)

            return success

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_image, quadkey)
                for quadkey in mercator_tiles.quadkeys
            ]

            successful_downloads = 0
            with tqdm(total=total_tiles) as pbar:
                for future in as_completed(futures):
                    if future.result():
                        successful_downloads += 1
                    pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
        )

    def download_images_by_bounds(
        self,
        gdf: gpd.GeoDataFrame,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given points using the specified style

        One PNG is written per row, named "{image_prefix}{row index}.png".

        Args:
            gdf: GeoDataFrame containing bounding box polygons
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)
        # self.data_store.makedirs(str(output_dir), exist_ok=True)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_images = len(gdf)

        self.logger.info(
            f"Downloading {total_images} images with size {image_size_str}..."
        )

        def download_image(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
            # Fetch one image for a row's geometry bounds; False on failure.
            file_name = f"{image_prefix}{idx}.png"
            url = self._construct_url(bounds, image_size_str)
            success = self._download_single_image(url, output_dir / file_name)
            return success

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_image, row.Index, row.geometry.bounds)
                for row in gdf.itertuples()
            ]

            successful_downloads = 0
            with tqdm(total=total_images) as pbar:
                for future in as_completed(futures):
                    if future.result():
                        successful_downloads += 1
                    pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_images} images!"
        )

    def download_images_by_coordinates(
        self,
        data: Union[pd.DataFrame, List[Tuple[float, float]]],
        res_meters_pixel: float,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given coordinates by creating bounded boxes around points

        Args:
            data: Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples
            res_meters_pixel: Size of the bounding box in meters (creates a square)
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """

        if isinstance(data, pd.DataFrame):
            coordinates_df = data
        else:
            coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

        gdf = convert_to_geodataframe(coordinates_df)

        # Buffer by half the box size on each side so the square spans
        # res_meters_pixel in total.
        buffered_gdf = buffer_geodataframe(
            gdf, res_meters_pixel / 2, cap_style="square"
        )

        self.download_images_by_bounds(
            buffered_gdf, output_dir, image_size, max_workers, image_prefix
        )
__init__(access_token=config.MAPBOX_ACCESS_TOKEN, style_id=None, data_store=None)

Initialize the downloader with Mapbox credentials

Parameters:

Name Type Description Default
access_token str

Mapbox access token

MAPBOX_ACCESS_TOKEN
style_id Optional[str]

Mapbox style ID to use for image download

None
data_store Optional[DataStore]

Instance of DataStore for accessing data storage

None
Source code in gigaspatial/handlers/mapbox_image.py
def __init__(
    self,
    access_token: str = config.MAPBOX_ACCESS_TOKEN,
    style_id: Optional[str] = None,
    data_store: Optional[DataStore] = None,
):
    """
    Initialize the downloader with Mapbox credentials

    Args:
        access_token: Mapbox access token
        style_id: Mapbox style ID to use for image download; defaults to
            "mapbox/satellite-v9" when not provided
        data_store: Instance of DataStore for accessing data storage;
            defaults to a LocalDataStore when not provided
    """
    self.access_token = access_token
    # Fall back to the default satellite style when none is given.
    self.style_id = style_id or "mapbox/satellite-v9"
    self.data_store = data_store if data_store is not None else LocalDataStore()
    self.logger = config.get_logger(type(self).__name__)
download_images_by_bounds(gdf, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given points using the specified style

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing bounding box polygons

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
max_workers int

Maximum number of concurrent downloads

4
image_prefix str

Prefix for output image names

'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_bounds(
    self,
    gdf: gpd.GeoDataFrame,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download one image per bounding-box polygon in a GeoDataFrame.

    Args:
        gdf: GeoDataFrame containing bounding box polygons
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """
    out_dir = Path(output_dir)

    size_str = f"{image_size[0]}x{image_size[1]}"
    total = len(gdf)

    self.logger.info(f"Downloading {total} images with size {size_str}...")

    def fetch_one(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
        """Request and store a single image; True on success."""
        target = out_dir / f"{image_prefix}{idx}.png"
        return self._download_single_image(
            self._construct_url(bounds, size_str), target
        )

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(fetch_one, row.Index, row.geometry.bounds)
            for row in gdf.itertuples()
        ]

        succeeded = 0
        with tqdm(total=total) as bar:
            for fut in as_completed(pending):
                if fut.result():
                    succeeded += 1
                bar.update(1)

    self.logger.info(f"Successfully downloaded {succeeded}/{total} images!")
download_images_by_coordinates(data, res_meters_pixel, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given coordinates by creating bounded boxes around points

Parameters:

Name Type Description Default
data Union[DataFrame, List[Tuple[float, float]]]

Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples

required
res_meters_pixel float

Size of the bounding box in meters (creates a square)

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
max_workers int

Maximum number of concurrent downloads

4
image_prefix str

Prefix for output image names

'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_coordinates(
    self,
    data: Union[pd.DataFrame, List[Tuple[float, float]]],
    res_meters_pixel: float,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download images for coordinates by buffering each point into a square box.

    Args:
        data: DataFrame with latitude/longitude (or geometry) columns, or a
            list of (lat, lon) tuples
        res_meters_pixel: Size of the bounding box in meters (creates a square)
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data, columns=["latitude", "longitude"])

    points = convert_to_geodataframe(data)

    # Buffer by half the box size on each side so the square spans
    # res_meters_pixel in total.
    boxes = buffer_geodataframe(points, res_meters_pixel / 2, cap_style="square")

    self.download_images_by_bounds(
        boxes, output_dir, image_size, max_workers, image_prefix
    )
download_images_by_tiles(mercator_tiles, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given mercator tiles using the specified style

Parameters:

Name Type Description Default
mercator_tiles MercatorTiles

MercatorTiles instance containing quadkeys

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
max_workers int

Maximum number of concurrent downloads

4
image_prefix str

Prefix for output image names

'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_tiles(
    self,
    mercator_tiles: "MercatorTiles",
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download one image per quadkey tile using the configured style.

    Args:
        mercator_tiles: MercatorTiles instance containing quadkeys
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """
    out_dir = Path(output_dir)

    size_str = f"{image_size[0]}x{image_size[1]}"
    total = len(mercator_tiles.quadkeys)

    self.logger.info(f"Downloading {total} tiles with size {size_str}...")

    def fetch_tile(quadkey: str) -> bool:
        """Resolve a quadkey to lon/lat bounds and download its image."""
        tb = mercantile.bounds(mercantile.quadkey_to_tile(quadkey))
        url = self._construct_url([tb.west, tb.south, tb.east, tb.north], size_str)
        return self._download_single_image(
            url, out_dir / f"{image_prefix}{quadkey}.png"
        )

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(fetch_tile, qk) for qk in mercator_tiles.quadkeys]

        succeeded = 0
        with tqdm(total=total) as bar:
            for fut in as_completed(pending):
                if fut.result():
                    succeeded += 1
                bar.update(1)

    self.logger.info(f"Successfully downloaded {succeeded}/{total} images!")

maxar_image

MaxarConfig

Bases: BaseModel

Configuration for Maxar Image Downloader using Pydantic

Source code in gigaspatial/handlers/maxar_image.py
class MaxarConfig(BaseModel):
    """Configuration for Maxar Image Downloader using Pydantic"""

    # Credentials default to values from the global config; the validator
    # below rejects empty strings.
    username: str = Field(
        default=global_config.MAXAR_USERNAME, description="Maxar API username"
    )
    password: str = Field(
        default=global_config.MAXAR_PASSWORD, description="Maxar API password"
    )
    connection_string: str = Field(
        default=global_config.MAXAR_CONNECTION_STRING,
        description="Maxar WMS connection string",
    )

    base_url: HttpUrl = Field(
        default="https://evwhs.digitalglobe.com/mapservice/wmsaccess?",
        description="Base URL for Maxar WMS service",
    )

    # Only these two DigitalGlobe layers are accepted.
    layers: List[Literal["DigitalGlobe:ImageryFootprint", "DigitalGlobe:Imagery"]] = (
        Field(
            default=["DigitalGlobe:Imagery"],
            description="List of layers to request from WMS",
        )
    )

    feature_profile: str = Field(
        default="Most_Aesthetic_Mosaic_Profile",
        description="Feature profile to use for WMS requests",
    )

    # Empty string means no CQL filtering is applied.
    coverage_cql_filter: str = Field(
        default="", description="CQL filter for coverage selection"
    )

    exceptions: str = Field(
        default="application/vnd.ogc.se_xml",
        description="Exception handling format for WMS",
    )

    transparent: bool = Field(
        default=True,
        description="Whether the requested images should have transparency",
    )

    # MIME type of the returned imagery; `suffix` below derives the file
    # extension from it.
    image_format: Literal["image/png", "image/jpeg", "image/geotiff"] = Field(
        default="image/png",
    )

    data_crs: Literal["EPSG:4326", "EPSG:3395", "EPSG:3857", "CAR:42004"] = Field(
        default="EPSG:4326"
    )

    max_retries: int = Field(
        default=3, description="Number of retries for failed image downloads"
    )

    retry_delay: int = Field(default=5, description="Delay in seconds between retries")

    @field_validator("username", "password", "connection_string")
    @classmethod
    def validate_non_empty(cls, value: str, field) -> str:
        """Ensure required credentials are provided"""
        # NOTE(review): with pydantic v2's field_validator the second argument
        # is a ValidationInfo whose attribute is `field_name`, not `.name`;
        # `.name` matches pydantic v1 semantics — confirm the pinned version.
        if not value or value.strip() == "":
            raise ValueError(
                f"{field.name} cannot be empty. Please provide a valid {field.name}."
            )
        return value

    @property
    def wms_url(self) -> str:
        """Generate the full WMS URL with connection string"""
        return f"{self.base_url}connectid={self.connection_string}"

    @property
    def suffix(self) -> str:
        # File extension derived from the MIME type, e.g. "image/png" -> ".png".
        return f".{self.image_format.split('/')[1]}"
wms_url: str property

Generate the full WMS URL with connection string

validate_non_empty(value, field) classmethod

Ensure required credentials are provided

Source code in gigaspatial/handlers/maxar_image.py
@field_validator("username", "password", "connection_string")
@classmethod
def validate_non_empty(cls, value: str, field) -> str:
    """Ensure required credentials are provided"""
    # NOTE(review): with pydantic v2's field_validator the second argument
    # is a ValidationInfo whose attribute is `field_name`, not `.name`;
    # `.name` matches pydantic v1 semantics — confirm the pinned version.
    if not value or value.strip() == "":
        raise ValueError(
            f"{field.name} cannot be empty. Please provide a valid {field.name}."
        )
    return value

MaxarImageDownloader

Class to download images from Maxar

Source code in gigaspatial/handlers/maxar_image.py
class MaxarImageDownloader:
    """Class to download images from Maxar"""

    def __init__(
        self,
        config: Optional[MaxarConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the downloader with Maxar config.

        Args:
            config: MaxarConfig instance containing credentials and settings
            data_store: Instance of DataStore for accessing data storage
        """
        self.config = config or MaxarConfig()
        # Authenticated OWSLib WMS client for the configured Maxar endpoint.
        self.wms = WebMapService(
            self.config.wms_url,
            username=self.config.username,
            password=self.config.password,
        )
        self.data_store = data_store or LocalDataStore()
        self.logger = global_config.get_logger(self.__class__.__name__)

    def _download_single_image(self, bbox, output_path: Union[Path, str], size) -> bool:
        """
        Download a single WMS image for `bbox` at pixel `size`, with retries.

        Args:
            bbox: (minx, miny, maxx, maxy) bounds expressed in `config.data_crs`
            output_path: Destination path within the data store
            size: (width, height) of the requested image in pixels

        Returns:
            bool: True if the image was written, False if all retries failed
        """
        # Normalize to Path so `.name` below is valid even when a plain str is
        # passed (the signature advertises Union[Path, str]).
        output_path = Path(output_path)
        for attempt in range(self.config.max_retries):
            try:
                img_data = self.wms.getmap(
                    bbox=bbox,
                    layers=self.config.layers,
                    srs=self.config.data_crs,
                    size=size,
                    featureProfile=self.config.feature_profile,
                    coverage_cql_filter=self.config.coverage_cql_filter,
                    exceptions=self.config.exceptions,
                    transparent=self.config.transparent,
                    format=self.config.image_format,
                )
                self.data_store.write_file(str(output_path), img_data.read())
                return True
            except Exception as e:
                self.logger.warning(
                    f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
                )
                if attempt < self.config.max_retries - 1:
                    sleep(self.config.retry_delay)
                else:
                    self.logger.warning(
                        f"Failed to download {output_path.name} after {self.config.max_retries} attempts: {str(e)}"
                    )
                    return False
        # Only reachable when max_retries <= 0; report failure explicitly
        # instead of implicitly returning None.
        return False

    def download_images_by_tiles(
        self,
        mercator_tiles: "MercatorTiles",
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download images for given mercator tiles using the specified style

        One file per quadkey is written, named
        "{image_prefix}{quadkey}{config.suffix}".

        Args:
            mercator_tiles: MercatorTiles instance containing quadkeys
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_tiles = len(mercator_tiles.quadkeys)

        self.logger.info(
            f"Downloading {total_tiles} tiles with size {image_size_str}..."
        )

        def _get_tile_bounds(quadkey: str) -> Tuple[float, float, float, float]:
            """Get (west, south, east, north) bounds for a quadkey."""
            tile = mercantile.quadkey_to_tile(quadkey)
            bounds = mercantile.bounds(tile)
            return (bounds.west, bounds.south, bounds.east, bounds.north)

        def download_image(quadkey: str, image_size: Tuple[int, int]) -> bool:
            """Download one tile image; True on success."""
            bounds = _get_tile_bounds(quadkey)
            file_name = f"{image_prefix}{quadkey}{self.config.suffix}"
            return self._download_single_image(
                bounds, output_dir / file_name, image_size
            )

        successful_downloads = 0
        with tqdm(total=total_tiles) as pbar:
            for quadkey in mercator_tiles.quadkeys:
                if download_image(quadkey, image_size):
                    successful_downloads += 1
                pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
        )

    def download_images_by_bounds(
        self,
        gdf: gpd.GeoDataFrame,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download one image per bounding-box polygon in a GeoDataFrame.

        The frame is reprojected to `config.data_crs` before its geometry
        bounds are used for WMS requests.

        Args:
            gdf: GeoDataFrame containing bounding box polygons
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_images = len(gdf)

        self.logger.info(
            f"Downloading {total_images} images with size {image_size_str}..."
        )

        def download_image(
            idx: Any,
            bounds: Tuple[float, float, float, float],
            image_size,
        ) -> bool:
            """Download one image for a row's bounds; True on success."""
            file_name = f"{image_prefix}{idx}{self.config.suffix}"
            return self._download_single_image(
                bounds, output_dir / file_name, image_size
            )

        gdf = gdf.to_crs(self.config.data_crs)

        successful_downloads = 0
        with tqdm(total=total_images) as pbar:
            for row in gdf.itertuples():
                if download_image(row.Index, tuple(row.geometry.bounds), image_size):
                    successful_downloads += 1
                pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_images} images!"
        )

    def download_images_by_coordinates(
        self,
        data: Union[pd.DataFrame, List[Tuple[float, float]]],
        res_meters_pixel: float,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download images for given coordinates by creating bounded boxes around points

        Args:
            data: Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples
            res_meters_pixel: Size of the bounding box in meters (creates a square)
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """
        if isinstance(data, pd.DataFrame):
            coordinates_df = data
        else:
            coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

        gdf = convert_to_geodataframe(coordinates_df)

        # Buffer by half the box size on each side so the square spans
        # res_meters_pixel in total.
        buffered_gdf = buffer_geodataframe(
            gdf, res_meters_pixel / 2, cap_style="square"
        )

        # download_images_by_bounds reprojects to config.data_crs itself,
        # so no separate to_crs call is needed here.
        self.download_images_by_bounds(
            buffered_gdf, output_dir, image_size, image_prefix
        )
__init__(config=None, data_store=None)

Initialize the downloader with Maxar config.

Parameters:

Name Type Description Default
config Optional[MaxarConfig]

MaxarConfig instance containing credentials and settings

None
data_store Optional[DataStore]

Instance of DataStore for accessing data storage

None
Source code in gigaspatial/handlers/maxar_image.py
def __init__(
    self,
    config: Optional[MaxarConfig] = None,
    data_store: Optional[DataStore] = None,
):
    """
    Initialize the downloader with Maxar config.

    Args:
        config: MaxarConfig instance containing credentials and settings;
            a default MaxarConfig is built when omitted
        data_store: Instance of DataStore for accessing data storage;
            defaults to a LocalDataStore when omitted
    """
    self.config = config or MaxarConfig()
    self.data_store = data_store or LocalDataStore()
    # Authenticated OWSLib WMS client for the configured Maxar endpoint.
    self.wms = WebMapService(
        self.config.wms_url,
        username=self.config.username,
        password=self.config.password,
    )
    self.logger = global_config.get_logger(type(self).__name__)
download_images_by_bounds(gdf, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for given points using the specified style

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing bounding box polygons

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_bounds(
    self,
    gdf: gpd.GeoDataFrame,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download one Maxar image per bounding-box polygon in a GeoDataFrame.

    Args:
        gdf: GeoDataFrame containing bounding box polygons
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        image_prefix: Prefix for output image names
    """
    out_dir = Path(output_dir)

    size_str = f"{image_size[0]}x{image_size[1]}"
    total = len(gdf)

    self.logger.info(f"Downloading {total} images with size {size_str}...")

    suffix = self.config.suffix

    def fetch_one(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
        """Download a single image for the given bounds; True on success."""
        return self._download_single_image(
            bounds, out_dir / f"{image_prefix}{idx}{suffix}", image_size
        )

    # Bounds must be expressed in the CRS the WMS service expects.
    projected = gdf.to_crs(self.config.data_crs)

    succeeded = 0
    with tqdm(total=total) as bar:
        for row in projected.itertuples():
            if fetch_one(row.Index, tuple(row.geometry.bounds)):
                succeeded += 1
            bar.update(1)

    self.logger.info(f"Successfully downloaded {succeeded}/{total} images!")
download_images_by_coordinates(data, res_meters_pixel, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for given coordinates by creating bounded boxes around points

Parameters:

Name Type Description Default
data Union[DataFrame, List[Tuple[float, float]]]

Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples

required
res_meters_pixel float

resolution in meters per pixel

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_coordinates(
    self,
    data: Union[pd.DataFrame, List[Tuple[float, float]]],
    res_meters_pixel: float,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download one image per input coordinate by buffering each point into a
    square box and delegating to `download_images_by_bounds`.

    Args:
        data: Either a DataFrame with latitude/longitude columns or a geometry
            column, or a list of (lat, lon) tuples.
        res_meters_pixel: Resolution in meters per pixel; each point is
            buffered by half this value on every side.
        output_dir: Directory to save images.
        image_size: Tuple of (width, height) for output images.
        image_prefix: Prefix for output image names.
    """
    # Normalize the input into a DataFrame of coordinates.
    points_df = (
        data
        if isinstance(data, pd.DataFrame)
        else pd.DataFrame(data, columns=["latitude", "longitude"])
    )

    points_gdf = convert_to_geodataframe(points_df)

    # A square buffer of half the resolution produces a box that is
    # res_meters_pixel wide, centered on the point.
    boxes_gdf = buffer_geodataframe(
        points_gdf, res_meters_pixel / 2, cap_style="square"
    ).to_crs(self.config.data_crs)

    self.download_images_by_bounds(boxes_gdf, output_dir, image_size, image_prefix)
download_images_by_tiles(mercator_tiles, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for given mercator tiles using the specified style

Parameters:

Name Type Description Default
mercator_tiles MercatorTiles

MercatorTiles instance containing quadkeys

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_tiles(
    self,
    mercator_tiles: "MercatorTiles",
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download one image per quadkey held by a MercatorTiles instance.

    Args:
        mercator_tiles: MercatorTiles instance containing quadkeys.
        output_dir: Directory to save images.
        image_size: Tuple of (width, height) for output images.
        image_prefix: Prefix for output image names.
    """
    out_path = Path(output_dir)
    quadkeys = mercator_tiles.quadkeys
    n_tiles = len(quadkeys)

    self.logger.info(
        f"Downloading {n_tiles} tiles with size {image_size[0]}x{image_size[1]}..."
    )

    def _tile_bounds(qk: str) -> Tuple[float]:
        """Return (west, south, east, north) for a quadkey."""
        tile = mercantile.quadkey_to_tile(qk)
        box = mercantile.bounds(tile)
        return (box.west, box.south, box.east, box.north)

    def _fetch(
        qk: str, size: Tuple[int, int], suffix: str = self.config.suffix
    ) -> bool:
        # One output file per quadkey, named <prefix><quadkey><suffix>.
        target = out_path / f"{image_prefix}{qk}{suffix}"
        return self._download_single_image(_tile_bounds(qk), target, size)

    n_ok = 0
    with tqdm(total=n_tiles) as progress:
        for qk in quadkeys:
            if _fetch(qk, image_size):
                n_ok += 1
            progress.update(1)

    self.logger.info(
        f"Successfully downloaded {n_ok}/{n_tiles} images!"
    )

microsoft_global_buildings

MSBuildingsConfig dataclass

Bases: BaseHandlerConfig

Configuration for Microsoft Global Buildings dataset files.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class MSBuildingsConfig(BaseHandlerConfig):
    """Configuration for Microsoft Global Buildings dataset files.

    Loads the dataset's tile index (location, quadkey, url, size, upload_date)
    from the public dataset-links CSV and maintains a mapping between the
    dataset's free-form location names and ISO 3166-1 alpha-3 country codes.
    """

    # Public CSV listing every available tile and its download URL.
    TILE_URLS: str = (
        "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv"
    )
    # Zoom level used when tiling a country/geometry for quadkey matching.
    MERCATOR_ZOOM_LEVEL: int = 9
    base_path: Path = global_config.get_path("microsoft_global_buildings", "bronze")

    # Cached location-name -> ISO alpha-3 mapping, persisted as JSON.
    LOCATION_MAPPING_FILE: Path = base_path / "location_mapping.json"
    # Minimum similarity ratio for a fuzzy location/country match.
    SIMILARITY_SCORE: float = 0.8
    # Hand-curated fixes for location names the fuzzy matcher cannot resolve.
    DEFAULT_MAPPING: Dict[str, str] = field(
        default_factory=lambda: {
            "Bonaire": "BES",
            "Brunei": "BRN",
            "IvoryCoast": "CIV",
            "CongoDRC": "COD",
            "DemocraticRepublicoftheCongo": "COD",
            "RepublicoftheCongo": "COG",
            "TheGambia": "GMB",
            "FYROMakedonija": "MKD",
            "SultanateofOman": "OMN",
            "StateofQatar": "QAT",
            "Russia": "RUS",
            "KingdomofSaudiArabia": "SAU",
            "Svalbard": "SJM",
            "Swaziland": "SWZ",
            "StMartin": "SXM",
            "leSaint-Martin": "MAF",
            "Turkey": "TUR",
            "VaticanCity": "VAT",
            "BritishVirginIslands": "VGB",
            "USVirginIslands": "VIR",
            "RepublicofYemen": "YEM",
            "CzechRepublic": "CZE",
            "French-Martinique": "MTQ",
            "French-Guadeloupe": "GLP",
            "UnitedStates": "USA",
        }
    )
    # Optional user-supplied overrides; applied after defaults and cache.
    CUSTOM_MAPPING: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Initialize the configuration, load tile URLs, and set up location mapping."""
        super().__post_init__()
        self._load_tile_urls()
        # NOTE(review): assumes every row of the index shares one upload
        # date — TODO confirm this upstream guarantee.
        self.upload_date = self.df_tiles.upload_date[0]
        self._setup_location_mapping()

    def _load_tile_urls(self):
        """Load dataset links from csv file into ``self.df_tiles``."""
        self.df_tiles = pd.read_csv(
            self.TILE_URLS,
            names=["location", "quadkey", "url", "size", "upload_date"],
            # Quadkeys may start with 0; keep them as strings.
            dtype={"quadkey": str},
            header=0,
        )

    def _setup_location_mapping(self):
        """Load or create the mapping between dataset locations and ISO country codes."""
        from gigaspatial.core.io.readers import read_json
        from gigaspatial.core.io.writers import write_json

        if self.data_store.file_exists(str(self.LOCATION_MAPPING_FILE)):
            self.location_mapping = read_json(
                self.data_store, str(self.LOCATION_MAPPING_FILE)
            )
        else:
            # First run: build the mapping by fuzzy matching and cache it.
            self.location_mapping = self.create_location_mapping(
                similarity_score_threshold=self.SIMILARITY_SCORE
            )
            self.location_mapping.update(self.DEFAULT_MAPPING)
            write_json(
                self.location_mapping, self.data_store, str(self.LOCATION_MAPPING_FILE)
            )

        # User-supplied overrides always win.
        self.location_mapping.update(self.CUSTOM_MAPPING or {})
        self._map_locations()
        # Normalize unmapped entries from NaN to None for downstream checks.
        self.df_tiles.loc[self.df_tiles.country.isnull(), "country"] = None

    def _map_locations(self):
        """Map the 'location' column in the tiles DataFrame to ISO country codes."""
        self.df_tiles["country"] = self.df_tiles.location.map(self.location_mapping)

    def create_location_mapping(self, similarity_score_threshold: float = 0.8):
        """
        Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

        This function iterates through known countries and attempts to find matching
        locations in the dataset based on string similarity.

        Args:
            similarity_score_threshold: The minimum similarity score (between 0 and 1)
                                        for a dataset location to be considered a match
                                        for a country. Defaults to 0.8.

        Returns:
            A dictionary where keys are dataset location names and values are
            the corresponding ISO 3166-1 alpha-3 country codes.
        """

        def similar(a, b):
            return SequenceMatcher(None, a, b).ratio()

        location_mapping = dict()

        for country in pycountry.countries:
            if country.name not in self.df_tiles.location.unique():
                try:
                    country_quadkey = CountryMercatorTiles.create(
                        country.alpha_3, self.MERCATOR_ZOOM_LEVEL
                    )
                # Fixed: narrowed from a bare ``except:`` (which also
                # swallowed KeyboardInterrupt/SystemExit).
                except Exception:
                    self.logger.warning(f"{country.name} is not mapped.")
                    continue
                country_datasets = country_quadkey.filter_quadkeys(
                    self.df_tiles.quadkey
                )
                matching_locations = self.df_tiles[
                    self.df_tiles.quadkey.isin(country_datasets.quadkeys)
                ].location.unique()
                # Compare against the common name when pycountry provides one.
                scores = np.array(
                    [
                        (
                            similar(c, country.common_name)
                            if hasattr(country, "common_name")
                            else similar(c, country.name)
                        )
                        for c in matching_locations
                    ]
                )
                if any(scores > similarity_score_threshold):
                    matched = matching_locations[scores > similarity_score_threshold]
                    # Fixed: warn for ANY ambiguous result (was ``> 2``, which
                    # silently accepted exactly two candidates). Despite the
                    # warning text, the first candidate is still used below,
                    # preserving the original mapping behavior.
                    if len(matched) > 1:
                        self.logger.warning(
                            f"Multiple matches exist for {country.name}. {country.name} is not mapped."
                        )
                    location_mapping[matched[0]] = country.alpha_3
                    self.logger.debug(f"{country.name} matched with {matched[0]}!")
                else:
                    self.logger.warning(
                        f"No direct matches for {country.name}. {country.name} is not mapped."
                    )
                    self.logger.debug("Possible matches are: ")
                    for c, score in zip(matching_locations, scores):
                        # Fixed: ``logger.debug(c, score)`` treated ``score``
                        # as a %-format argument for a message with no format
                        # spec; use lazy %-style formatting instead.
                        self.logger.debug("%s: %s", c, score)
            else:
                location_mapping[country.name] = country.alpha_3

        return location_mapping

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Get the tile records that intersect a given source spatial geometry.

        If a country is given, this method first tries to find tiles directly
        mapped to that country. If no directly mapped tiles are found and the
        country is not in the location mapping, it attempts to find overlapping
        tiles by creating Mercator tiles for the country and filtering the
        dataset's tiles.

        Returns:
            A list of dicts with 'quadkey', 'url', 'country' and 'location'
            keys. (Annotation fixed: the method returns ``.to_dict("records")``,
            not a DataFrame.)
        """
        source = geometry

        if isinstance(source, str):
            try:
                country_code = pycountry.countries.lookup(source).alpha_3
            # pycountry's lookup raises LookupError on no match.
            except LookupError:
                raise ValueError("Invalid `country` value!")

            mask = self.df_tiles["country"] == country_code

            if any(mask):
                return self.df_tiles.loc[
                    mask, ["quadkey", "url", "country", "location"]
                ].to_dict("records")

            self.logger.warning(
                f"The country code '{country_code}' is not directly in the location mapping. "
                "Manually checking for overlapping locations with the country boundary."
            )

            source_tiles = CountryMercatorTiles.create(
                country_code, self.MERCATOR_ZOOM_LEVEL
            )
        else:
            source_tiles = MercatorTiles.from_spatial(
                source=source, zoom_level=self.MERCATOR_ZOOM_LEVEL
            )

        filtered_tiles = source_tiles.filter_quadkeys(self.df_tiles.quadkey)

        mask = self.df_tiles.quadkey.isin(filtered_tiles.quadkeys)

        return self.df_tiles.loc[
            mask, ["quadkey", "url", "country", "location"]
        ].to_dict("records")

    def get_data_unit_path(self, unit: Union[pd.Series, dict], **kwargs) -> Path:
        """Return the local storage path for a single tile record."""
        # Prefer the ISO country code; fall back to the raw dataset location
        # for tiles that could not be mapped.
        tile_location = unit["country"] if unit["country"] else unit["location"]

        return (
            self.base_path
            / tile_location
            / self.upload_date
            / f'{unit["quadkey"]}.csv.gz'
        )

    def get_data_unit_paths(
        self, units: Union[pd.DataFrame, Iterable[dict]], **kwargs
    ) -> List:
        """Return storage paths for many tile records (DataFrame or dicts)."""
        if isinstance(units, pd.DataFrame):
            return [self.get_data_unit_path(row) for _, row in units.iterrows()]
        return super().get_data_unit_paths(units)

    def extract_search_geometry(self, source, **kwargs):
        """Override the method since geometry will be extracted by MercatorTiles"""
        # No-op on purpose: MercatorTiles.from_spatial handles extraction.
        return source
__post_init__()

Initialize the configuration, load tile URLs, and set up location mapping.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def __post_init__(self):
    """Initialize the configuration, load tile URLs, and set up location mapping."""
    super().__post_init__()
    # Fetch the dataset-links CSV into self.df_tiles.
    self._load_tile_urls()
    # NOTE(review): assumes all index rows share one upload date — TODO confirm.
    self.upload_date = self.df_tiles.upload_date[0]
    # Build (or load the cached) location -> ISO alpha-3 mapping.
    self._setup_location_mapping()
create_location_mapping(similarity_score_threshold=0.8)

Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

This function iterates through known countries and attempts to find matching locations in the dataset based on string similarity.

Parameters:

Name Type Description Default
similarity_score_threshold float

The minimum similarity score (between 0 and 1) for a dataset location to be considered a match for a country. Defaults to 0.8.

0.8

Returns:

Type Description

A dictionary where keys are dataset location names and values are

the corresponding ISO 3166-1 alpha-3 country codes.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_location_mapping(self, similarity_score_threshold: float = 0.8):
    """
    Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

    This function iterates through known countries and attempts to find matching
    locations in the dataset based on string similarity.

    Args:
        similarity_score_threshold: The minimum similarity score (between 0 and 1)
                                    for a dataset location to be considered a match
                                    for a country. Defaults to 0.8.

    Returns:
        A dictionary where keys are dataset location names and values are
        the corresponding ISO 3166-1 alpha-3 country codes.
    """

    def similar(a, b):
        return SequenceMatcher(None, a, b).ratio()

    location_mapping = dict()

    for country in pycountry.countries:
        if country.name not in self.df_tiles.location.unique():
            try:
                country_quadkey = CountryMercatorTiles.create(
                    country.alpha_3, self.MERCATOR_ZOOM_LEVEL
                )
            except:
                self.logger.warning(f"{country.name} is not mapped.")
                continue
            country_datasets = country_quadkey.filter_quadkeys(
                self.df_tiles.quadkey
            )
            matching_locations = self.df_tiles[
                self.df_tiles.quadkey.isin(country_datasets.quadkeys)
            ].location.unique()
            scores = np.array(
                [
                    (
                        similar(c, country.common_name)
                        if hasattr(country, "common_name")
                        else similar(c, country.name)
                    )
                    for c in matching_locations
                ]
            )
            if any(scores > similarity_score_threshold):
                matched = matching_locations[scores > similarity_score_threshold]
                if len(matched) > 2:
                    self.logger.warning(
                        f"Multiple matches exist for {country.name}. {country.name} is not mapped."
                    )
                location_mapping[matched[0]] = country.alpha_3
                self.logger.debug(f"{country.name} matched with {matched[0]}!")
            else:
                self.logger.warning(
                    f"No direct matches for {country.name}. {country.name} is not mapped."
                )
                self.logger.debug("Possible matches are: ")
                for c, score in zip(matching_locations, scores):
                    self.logger.debug(c, score)
        else:
            location_mapping[country.name] = country.alpha_3

    return location_mapping
extract_search_geometry(source, **kwargs)

Override the method since geometry will be extracted by MercatorTiles

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def extract_search_geometry(self, source, **kwargs):
    """Override the method since geometry will be extracted by MercatorTiles"""
    # Deliberate pass-through: MercatorTiles.from_spatial handles extraction.
    return source
get_relevant_data_units_by_geometry(geometry, **kwargs)

Get the DataFrame of Microsoft Buildings tiles that intersect with a given source spatial geometry.

If a country is given, this method first tries to find tiles directly mapped to that country. If no directly mapped tiles are found and the country is not in the location mapping, it attempts to find overlapping tiles by creating Mercator tiles for the country and filtering the dataset's tiles.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> pd.DataFrame:
    """
    Get the DataFrame of Microsoft Buildings tiles that intersect with a given source spatial geometry.

    In case country given, this method first tries to find tiles directly mapped to the given country.
    If no directly mapped tiles are found and the country is not in the location
    mapping, it attempts to find overlapping tiles by creating Mercator tiles
    for the country and filtering the dataset's tiles.
    """
    # NOTE(review): despite the annotation, every return path produces
    # ``.to_dict("records")`` — a list of dicts, not a DataFrame.
    source = geometry

    if isinstance(source, str):
        try:
            country_code = pycountry.countries.lookup(source).alpha_3
        # NOTE(review): bare except hides the underlying lookup error.
        except:
            raise ValueError("Invalid`country` value!")

        # Fast path: tiles already tagged with this country code.
        mask = self.df_tiles["country"] == country_code

        if any(mask):
            return self.df_tiles.loc[
                mask, ["quadkey", "url", "country", "location"]
            ].to_dict("records")

        self.logger.warning(
            f"The country code '{country_code}' is not directly in the location mapping. "
            "Manually checking for overlapping locations with the country boundary."
        )

        # Fall back to spatial overlap via the country's Mercator tiles.
        source_tiles = CountryMercatorTiles.create(
            country_code, self.MERCATOR_ZOOM_LEVEL
        )
    else:
        source_tiles = MercatorTiles.from_spatial(
            source=source, zoom_level=self.MERCATOR_ZOOM_LEVEL
        )

    filtered_tiles = source_tiles.filter_quadkeys(self.df_tiles.quadkey)

    mask = self.df_tiles.quadkey.isin(filtered_tiles.quadkeys)

    return self.df_tiles.loc[
        mask, ["quadkey", "url", "country", "location"]
    ].to_dict("records")

MSBuildingsDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of Microsoft's Global ML Building Footprints dataset.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsDownloader(BaseHandlerDownloader):
    """A class to handle downloads of Microsoft's Global ML Building Footprints dataset."""

    def __init__(
        self,
        config: Optional[MSBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Optional configuration for customizing download behavior and file paths.
                    If None, a default `MSBuildingsConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data storage.
                        If provided, it overrides the `data_store` in the `config`.
                        If None, the `data_store` from the `config` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        config = config or MSBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_info: Union[pd.Series, dict],
        **kwargs,
    ) -> Optional[str]:
        """Download data file for a single tile.

        Args:
            tile_info: Record with at least 'url' and 'quadkey' keys.

        Returns:
            The storage path the tile was written to, or None on failure.
        """

        tile_url = tile_info["url"]

        try:
            # Stream so large tiles are not buffered in memory; time out
            # instead of hanging forever on a dead connection (fix: the
            # original call had no timeout).
            response = requests.get(tile_url, stream=True, timeout=60)
            response.raise_for_status()

            file_path = str(self.config.get_data_unit_path(tile_info))

            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            # Log/return after the `with` block so the handle is closed first.
            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['quadkey']}"
            )
            return file_path

        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"Failed to download tile {tile_info['quadkey']}: {str(e)}"
            )
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None

    def download_data_units(
        self,
        tiles: Union[pd.DataFrame, List[dict]],
        **kwargs,
    ) -> List[str]:
        """Download data files for multiple tiles in parallel.

        Args:
            tiles: Tile records, either as a DataFrame or a list of dicts.

        Returns:
            Paths of the successfully downloaded tiles (failures are dropped).
        """

        if len(tiles) == 0:
            self.logger.warning("There is no matching data")
            return []

        # Normalize the input to a list so it can be mapped across workers.
        tile_records = (
            [row for _, row in tiles.iterrows()]
            if isinstance(tiles, pd.DataFrame)
            else tiles
        )

        with multiprocessing.Pool(self.config.n_workers) as pool:
            # The bound method can be passed directly; the original wrapped it
            # in a no-op functools.partial.
            file_paths = list(
                tqdm(
                    pool.imap(self.download_data_unit, tile_records),
                    total=len(tile_records),
                    desc="Downloading polygons data",
                )
            )

        return [path for path in file_paths if path is not None]

    def download_by_country(
        self,
        country: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
    ) -> List[str]:
        """
        Download Microsoft Global ML Building Footprints data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                `AdminBoundaries` for loading country boundaries. If None,
                `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                country boundary. If provided, this boundary is used
                instead of the default from `AdminBoundaries`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the country or if
            all downloads fail.
        """
        return self.download(
            source=country, data_store=data_store, path=country_geom_path
        )
__init__(config=None, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Optional[MSBuildingsConfig]

Optional configuration for customizing download behavior and file paths. If None, a default MSBuildingsConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If provided, it overrides the data_store in the config. If None, the data_store from the config is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/microsoft_global_buildings.py
def __init__(
    self,
    config: Optional[MSBuildingsConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Optional configuration for customizing download behavior and file paths.
                If None, a default `MSBuildingsConfig` is used.
        data_store: Optional instance of a `DataStore` for managing data storage.
                    If provided, it overrides the `data_store` in the `config`.
                    If None, the `data_store` from the `config` is used.
        logger: Optional custom logger instance. If None, a default logger
                named after the module is created and used.
    """
    # A default config triggers MSBuildingsConfig.__post_init__, which
    # downloads the tile index CSV.
    config = config or MSBuildingsConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
download_by_country(country, data_store=None, country_geom_path=None)

Download Microsoft Global ML Building Footprints data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

Name Type Description Default
country str

The country code (e.g., 'USA', 'GBR') or name.

required
data_store Optional[DataStore]

Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading.

None
country_geom_path Optional[Union[str, Path]]

Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries.

None

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles.

List[str]

Returns an empty list if no data is found for the country or if

List[str]

all downloads fail.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_by_country(
    self,
    country: str,
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """
    Download Microsoft Global ML Building Footprints data for a specific country.

    This is a convenience method to download data for an entire country
    using its code or name.

    Args:
        country: The country code (e.g., 'USA', 'GBR') or name.
        data_store: Optional instance of a `DataStore` to be used by
            `AdminBoundaries` for loading country boundaries. If None,
            `AdminBoundaries` will use its default data loading.
        country_geom_path: Optional path to a GeoJSON file containing the
            country boundary. If provided, this boundary is used
            instead of the default from `AdminBoundaries`.

    Returns:
        A list of local file paths for the successfully downloaded tiles.
        Returns an empty list if no data is found for the country or if
        all downloads fail.
    """
    # Thin wrapper: the base-class download() resolves the country source.
    return self.download(
        source=country, data_store=data_store, path=country_geom_path
    )
download_data_unit(tile_info, **kwargs)

Download data file for a single tile.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_data_unit(
    self,
    tile_info: Union[pd.Series, dict],
    **kwargs,
) -> Optional[str]:
    """Download data file for a single tile.

    Expects `tile_info` to carry at least 'url' and 'quadkey' keys; returns
    the written storage path, or None on any failure.
    """

    tile_url = tile_info["url"]

    try:
        # NOTE(review): no timeout — a dead connection can hang indefinitely.
        response = requests.get(tile_url, stream=True)
        response.raise_for_status()

        file_path = str(self.config.get_data_unit_path(tile_info))

        # Stream the body to storage in 8 KiB chunks.
        with self.data_store.open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['quadkey']}"
            )
            return file_path

    except requests.exceptions.RequestException as e:
        # Network/HTTP failures are logged and signalled via None.
        self.logger.error(
            f"Failed to download tile {tile_info['quadkey']}: {str(e)}"
        )
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
download_data_units(tiles, **kwargs)

Download data files for multiple tiles.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_data_units(
    self,
    tiles: Union[pd.DataFrame, List[dict]],
    **kwargs,
) -> List[str]:
    """Download data files for multiple tiles.

    Fans the per-tile download out over a multiprocessing pool sized by
    `config.n_workers`; failed tiles (None results) are filtered out.
    """

    if len(tiles) == 0:
        self.logger.warning(f"There is no matching data")
        return []

    with multiprocessing.Pool(self.config.n_workers) as pool:
        # partial() with no bound arguments; kept as written.
        download_func = functools.partial(self.download_data_unit)
        file_paths = list(
            tqdm(
                pool.imap(
                    download_func,
                    (
                        # Normalize a DataFrame into per-row records.
                        [row for _, row in tiles.iterrows()]
                        if isinstance(tiles, pd.DataFrame)
                        else tiles
                    ),
                ),
                total=len(tiles),
                desc=f"Downloading polygons data",
            )
        )

    return [path for path in file_paths if path is not None]

MSBuildingsHandler

Bases: BaseHandler

Handler for Microsoft Global Buildings dataset.

This class provides a unified interface for downloading and loading Microsoft Global Buildings data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsHandler(BaseHandler):
    """
    Handler for Microsoft Global Buildings dataset.

    Wires together the config, downloader and reader components required by
    `BaseHandler`, giving callers a single entry point for acquiring and
    loading Microsoft Global Buildings data.
    """

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> MSBuildingsConfig:
        """
        Build the configuration object for this handler.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured MSBuildingsConfig instance
        """
        cfg = MSBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
        return cfg

    def create_downloader(
        self,
        config: MSBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> MSBuildingsDownloader:
        """
        Build the downloader component for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured MSBuildingsDownloader instance
        """
        downloader = MSBuildingsDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )
        return downloader

    def create_reader(
        self,
        config: MSBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> MSBuildingsReader:
        """
        Build the reader component for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured MSBuildingsReader instance
        """
        reader = MSBuildingsReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )
        return reader
create_config(data_store, logger, **kwargs)

Create and return a MSBuildingsConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
MSBuildingsConfig

Configured MSBuildingsConfig instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> MSBuildingsConfig:
    """
    Create and return a MSBuildingsConfig instance.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured MSBuildingsConfig instance
    """
    # Factory hook used by BaseHandler; the shared data_store and logger are
    # injected so every component operates on the same backends.
    return MSBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
create_downloader(config, data_store, logger, **kwargs)

Create and return a MSBuildingsDownloader instance.

Parameters:

Name Type Description Default
config MSBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
MSBuildingsDownloader

Configured MSBuildingsDownloader instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_downloader(
    self,
    config: MSBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> MSBuildingsDownloader:
    """
    Create and return a MSBuildingsDownloader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured MSBuildingsDownloader instance
    """
    # Factory hook used by BaseHandler; forwards the shared backends.
    return MSBuildingsDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a MSBuildingsReader instance.

Parameters:

Name Type Description Default
config MSBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
MSBuildingsReader

Configured MSBuildingsReader instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_reader(
    self,
    config: MSBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> MSBuildingsReader:
    """
    Create and return a MSBuildingsReader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured MSBuildingsReader instance
    """
    # Factory hook used by BaseHandler; forwards the shared backends.
    return MSBuildingsReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )

MSBuildingsReader

Bases: BaseHandlerReader

Reader for Microsoft Global Buildings data, supporting country, points, and geometry-based resolution.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsReader(BaseHandlerReader):
    """
    Reader for Microsoft Global Buildings data, supporting country, points, and geometry-based resolution.
    """

    def __init__(
        self,
        config: Optional[MSBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Fall back to a default configuration when none is supplied.
        if config is None:
            config = MSBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> gpd.GeoDataFrame:
        """
        Load building data from the Microsoft Buildings dataset.

        Args:
            source_data_path: List of file paths to load

        Returns:
            GeoDataFrame containing building data
        """
        # Imported lazily so module import stays light.
        from gigaspatial.core.io.readers import read_gzipped_json_or_csv
        from shapely.geometry import shape

        def _parse_file(data_store: DataStore, file_path: str):
            # Each file is gzipped JSON/CSV; the "geometry" column holds
            # mappings accepted by shapely's `shape`.
            frame = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
            frame["geometry"] = frame["geometry"].apply(shape)
            return gpd.GeoDataFrame(frame, crs=4326)

        return self._load_tabular_data(
            file_paths=source_data_path, read_function=_parse_file
        )
load_from_paths(source_data_path, **kwargs)

Load building data from the Microsoft Buildings dataset. Parameters: `source_data_path` — list of file paths to load. Returns: a GeoDataFrame containing building data.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> gpd.GeoDataFrame:
    """
    Load building data from Microsoft Buildings dataset.
    Args:
        source_data_path: List of file paths to load
    Returns:
        GeoDataFrame containing building data
    """
    # Imported lazily so module import stays light.
    from gigaspatial.core.io.readers import read_gzipped_json_or_csv
    from shapely.geometry import shape

    def read_ms_dataset(data_store: DataStore, file_path: str):
        # Each file is gzipped JSON/CSV; the "geometry" column holds
        # mappings accepted by shapely's `shape`.
        df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
        df["geometry"] = df["geometry"].apply(shape)
        return gpd.GeoDataFrame(df, crs=4326)

    result = self._load_tabular_data(
        file_paths=source_data_path, read_function=read_ms_dataset
    )
    return result

ookla_speedtest

OoklaSpeedtestConfig dataclass

Bases: BaseHandlerConfig

Configuration class for Ookla Speedtest data.

This class defines the parameters for accessing and filtering Ookla Speedtest datasets, including available years, quarters, and how dataset URLs are constructed.

Source code in gigaspatial/handlers/ookla_speedtest.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class OoklaSpeedtestConfig(BaseHandlerConfig):
    """
    Configuration class for Ookla Speedtest data.

    This class defines the parameters for accessing and filtering Ookla Speedtest datasets,
    including available years, quarters, and how dataset URLs are constructed.
    """

    MIN_YEAR = 2019  # first year Ookla published open data
    MAX_YEAR = datetime.today().year
    # Latest fully published quarter. Quarters are published after they end,
    # so during Q1 the newest available data is Q4 of the previous year.
    MAX_QUARTER = int(np.floor((datetime.today().month - 1) / 3))
    if MAX_QUARTER == 0:
        MAX_YEAR -= 1
        MAX_QUARTER = 4

    BASE_URL = "https://ookla-open-data.s3.amazonaws.com/parquet/performance"

    base_path: Path = Field(default=config.get_path("ookla_speedtest", "bronze"))

    type: Literal["fixed", "mobile"] = Field(...)
    year: Optional[int] = Field(default=None, ge=MIN_YEAR, le=MAX_YEAR)
    # Quarters are 1-4. The previous lower bound of 0 let quarter=0 through
    # validation and built a URL from month[-1] (October) with the wrong year.
    quarter: Optional[int] = Field(default=None, ge=1, le=4)

    def __post_init__(self):
        # Default to the most recent published year/quarter when unspecified.
        if self.year is None:
            self.year = self.MAX_YEAR
            self.logger.warning(
                "Year not provided. Using the latest available data year: %s", self.year
            )
        if self.quarter is None:
            self.quarter = self.MAX_QUARTER
            self.logger.warning(
                "Quarter not provided. Using the latest available data quarter for year %s: %s",
                self.year,
                self.quarter,
            )

        super().__post_init__()
        # One global parquet file per (type, year, quarter).
        self.DATASET_URL = self._get_dataset_url(self.type, self.year, self.quarter)

    def _get_dataset_url(self, type, year, quarter):
        """Build the S3 parquet URL for the given service type, year and quarter."""
        quarter_start_months = [1, 4, 7, 10]
        # Use the `quarter` parameter consistently (previously this mixed in
        # `self.quarter`, silently ignoring the argument).
        quarter_start = datetime(year, quarter_start_months[quarter - 1], 1)
        return f"{self.BASE_URL}/type={type}/year={quarter_start:%Y}/quarter={quarter}/{quarter_start:%Y-%m-%d}_performance_{type}_tiles.parquet"

    @staticmethod
    def get_available_datasets():
        """List every (service_type, year, quarter) combination published so far."""
        start_year = 2019  # first data year
        max_year = datetime.today().year
        max_quarter = np.floor((datetime.today().month - 1) / 3)
        if max_quarter == 0:
            max_year -= 1
            max_quarter = 4

        ookla_tiles = []
        for year in range(start_year, max_year + 1):
            for quarter in range(1, 5):
                if year == max_year and quarter > max_quarter:
                    continue
                for type in ["fixed", "mobile"]:
                    ookla_tiles.append(
                        {"service_type": type, "year": year, "quarter": quarter}
                    )

        return ookla_tiles

    def get_relevant_data_units(self, source=None, **kwargs):
        """Return the single dataset URL; Ookla publishes one global file per quarter."""
        return [self.DATASET_URL]

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame] = None, **kwargs
    ) -> List[str]:
        # The dataset is a single global file regardless of geometry, so the
        # same URL is relevant; spatial filtering happens after loading.
        # (Previously this bare-returned None despite the List[str] annotation.)
        return [self.DATASET_URL]

    def get_data_unit_path(self, unit: str, **kwargs) -> Path:
        """
        Given an Ookla Speedtest file URL, return the corresponding local path.
        """
        return self.base_path / unit.split("/")[-1]
get_data_unit_path(unit, **kwargs)

Given an Ookla Speedtest file URL, return the corresponding local file path.

Source code in gigaspatial/handlers/ookla_speedtest.py
def get_data_unit_path(self, unit: str, **kwargs) -> Path:
    """
    Resolve the local path for an Ookla Speedtest file URL.

    The filename is the last URL segment, joined onto ``base_path``.
    """
    filename = unit.rsplit("/", 1)[-1]
    return self.base_path / filename

OoklaSpeedtestDownloader

Bases: BaseHandlerDownloader

A class to handle the downloading of Ookla Speedtest data.

This downloader focuses on fetching parquet files based on the provided configuration and data unit URLs.

Source code in gigaspatial/handlers/ookla_speedtest.py
class OoklaSpeedtestDownloader(BaseHandlerDownloader):
    """
    A class to handle the downloading of Ookla Speedtest data.

    This downloader focuses on fetching parquet files based on the provided configuration
    and data unit URLs.
    """

    def __init__(
        self,
        config: Union[OoklaSpeedtestConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Accept either a ready config object or a plain dict of config kwargs.
        config = (
            config
            if isinstance(config, OoklaSpeedtestConfig)
            else OoklaSpeedtestConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(self, url: str, **kwargs) -> Optional[Path]:
        """
        Stream one parquet file from `url` into the data store.

        Returns the destination path on success, or None on any failure
        (errors are logged rather than raised).
        """
        output_path = self.config.get_data_unit_path(url)

        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            # NOTE(review): chunks are 8192 bytes (8 KiB), so the "KB" unit
            # label understates each tick by a factor of 8 — confirm intended.
            with self.data_store.open(str(output_path), "wb") as file:
                for chunk in tqdm(
                    response.iter_content(chunk_size=8192),
                    total=total_size // 8192,
                    unit="KB",
                    desc=f"Downloading {output_path.name}",
                ):
                    file.write(chunk)

            self.logger.info(f"Successfully downloaded: {url} to {output_path}")
            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {str(e)}")
            return None

    def download_data_units(self, urls: List[str], **kwargs) -> List[Optional[Path]]:
        """Download each URL sequentially; failed downloads (None) are dropped."""
        # Ookla data is not parallelizable in a meaningful way beyond single file, so just iterate.
        results = [self.download_data_unit(url, **kwargs) for url in urls]
        return [path for path in results if path is not None]

    def download(
        self, source: Optional[Union[str, List[str]]] = None, **kwargs
    ) -> List[Optional[Path]]:
        """Resolve the relevant data-unit URLs from config and download them."""
        urls = self.config.get_relevant_data_units(source)
        return self.download_data_units(urls, **kwargs)

OoklaSpeedtestHandler

Bases: BaseHandler

Handler for Ookla Speedtest data.

This class orchestrates the configuration, downloading, and reading of Ookla Speedtest data, allowing for filtering by geographical sources using Mercator tiles.

Source code in gigaspatial/handlers/ookla_speedtest.py
class OoklaSpeedtestHandler(BaseHandler):
    """
    Handler for Ookla Speedtest data.

    This class orchestrates the configuration, downloading, and reading of Ookla Speedtest
    data, allowing for filtering by geographical sources using Mercator tiles.
    """

    def __init__(
        self,
        type: Literal["fixed", "mobile"],
        year: Optional[int] = None,
        quarter: Optional[int] = None,
        config: Optional[OoklaSpeedtestConfig] = None,
        downloader: Optional[OoklaSpeedtestDownloader] = None,
        reader: Optional[OoklaSpeedtestReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        # Stash dataset selectors before BaseHandler.__init__ runs, since the
        # base constructor creates components via create_config(), which reads them.
        self._type = type
        self._year = year
        self._quarter = quarter

        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> OoklaSpeedtestConfig:
        """Build the config from the stored type/year/quarter selectors."""
        return OoklaSpeedtestConfig(
            type=self._type,
            year=self._year,
            quarter=self._quarter,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: OoklaSpeedtestConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> OoklaSpeedtestDownloader:
        """Build the downloader component, sharing this handler's backends."""
        return OoklaSpeedtestDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: OoklaSpeedtestConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> OoklaSpeedtestReader:
        """Build the reader component, sharing this handler's backends."""
        return OoklaSpeedtestReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            str,  # path
            List[Union[str, Path]],
        ] = None,
        process_geospatial: bool = False,
        ensure_available: bool = True,
        **kwargs,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Load Ookla tiles, optionally filtered to a spatial source.

        If ``source`` is None or points at existing file path(s), the data is
        loaded as-is. Otherwise the full quarterly dataset is loaded and then
        filtered to the zoom-16 Mercator quadkeys covering ``source`` (a
        country name, geometry, GeoDataFrame, or list of points). Quadkey
        lists are cached on the config, keyed via ``_cache_key``; pass
        ``force_recompute=True`` to bypass the cache.

        Args:
            source: What to load or filter by (see above).
            process_geospatial: If True, parse the WKT ``tile`` column into
                geometries and return a GeoDataFrame in EPSG:4326.
            ensure_available: Forwarded to the base loader.
            **kwargs: Forwarded to the base loader / tile construction.

        Returns:
            A DataFrame, or a GeoDataFrame when ``process_geospatial`` is True.
        """

        if source is None or (
            isinstance(source, (str, Path))
            and (
                self.data_store.file_exists(str(source))
                or str(source).endswith(".parquet")
            )
            or (
                isinstance(source, List)
                and all(isinstance(p, (str, Path)) for p in source)
            )
        ):
            # If no source or source is a direct path, load without filtering
            result = super().load_data(source, ensure_available, **kwargs)
        else:
            # Load the entire dataset and then apply Mercator tile filtering
            full_dataset = super().load_data(
                None, ensure_available, **kwargs
            )  # Load the full dataset (uses DATASET_URL)

            key = self.config._cache_key(source, **kwargs)

            # Check cache unless forced recompute
            if (
                not kwargs.get("force_recompute", False)
                and key in self.config._unit_cache
            ):
                self.logger.debug(
                    f"Using cached quadkeys for {key[0]}: {key[1][:50]}..."
                )
                quadkeys = self.config._unit_cache[key]

            else:

                if isinstance(source, str):  # country
                    mercator_tiles = CountryMercatorTiles.create(
                        source, zoom_level=16, **kwargs
                    )
                elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame, List)):
                    mercator_tiles = MercatorTiles.from_spatial(
                        source, zoom_level=16, **kwargs
                    )
                else:
                    raise ValueError(
                        f"Unsupported source type for filtering: {type(source)}"
                    )

                quadkeys = mercator_tiles.quadkeys

                # Cache the result
                self.config._unit_cache[key] = quadkeys

            # Keep only tiles whose quadkey falls inside the source's coverage.
            result = full_dataset[full_dataset["quadkey"].isin(quadkeys)].reset_index(
                drop=True
            )

        if process_geospatial:
            # Convert 'tile' column from WKT to geometry
            result["geometry"] = result["tile"].apply(wkt.loads)
            return gpd.GeoDataFrame(result, geometry="geometry", crs="EPSG:4326")

        return result

OoklaSpeedtestReader

Bases: BaseHandlerReader

A class to handle reading Ookla Speedtest data.

It loads parquet files into a DataFrame.

Source code in gigaspatial/handlers/ookla_speedtest.py
class OoklaSpeedtestReader(BaseHandlerReader):
    """
    A class to handle reading Ookla Speedtest data.

    It loads parquet files into a DataFrame.
    """

    def __init__(
        self,
        config: Union[OoklaSpeedtestConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Accept either a ready config object or a plain dict of config kwargs.
        if not isinstance(config, OoklaSpeedtestConfig):
            config = OoklaSpeedtestConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> pd.DataFrame:
        """Read the given parquet files into a single DataFrame."""
        return self._load_tabular_data(file_paths=source_data_path)

    def load(
        self,
        source: Optional[
            Union[
                str,  # country
                List[Union[Tuple[float, float], Point]],  # points
                BaseGeometry,  # geometry
                gpd.GeoDataFrame,  # geodataframe
                Path,  # path
                str,  # path
                List[Union[str, Path]],
            ]
        ] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """Delegate to the base reader's source-dispatching load."""
        return super().load(source=source, **kwargs)

opencellid

OpenCellIDConfig

Bases: BaseModel

Configuration for OpenCellID data access

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDConfig(BaseModel):
    """Configuration for OpenCellID data access"""

    # Base URLs
    BASE_URL: HttpUrl = Field(default="https://opencellid.org/")
    DOWNLOAD_URL: HttpUrl = Field(default="https://opencellid.org/downloads.php?token=")

    # User configuration
    country: str = Field(...)  # normalized to ISO alpha-3 by validate_country below
    api_token: str = Field(
        default=global_config.OPENCELLID_ACCESS_TOKEN,
        description="OpenCellID API Access Token",
    )
    base_path: Path = Field(default=global_config.get_path("opencellid", "bronze"))
    created_newer: int = Field(
        default=2003, description="Filter out cell towers added before this year"
    )
    # NOTE: this default is evaluated once at import time, not per instance.
    created_before: int = Field(
        default=datetime.now().year,
        description="Filter out cell towers added after this year",
    )
    drop_duplicates: bool = Field(
        default=True,
        description="Drop cells that are in the exact same location and radio technology",
    )

    @field_validator("country")
    def validate_country(cls, value: str) -> str:
        """Normalize any recognizable country name/code to ISO 3166-1 alpha-3."""
        try:
            return pycountry.countries.lookup(value).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {value}")

    @property
    def output_file_path(self) -> Path:
        """Path to save the downloaded OpenCellID data"""
        return self.base_path / f"opencellid_{self.country.lower()}.csv.gz"

    def __repr__(self) -> str:
        # Compact multi-line summary of the user-tunable fields.
        return (
            f"OpenCellIDConfig(\n"
            f"  country='{self.country}'\n"
            f"  created_newer={self.created_newer}\n"
            f"  created_before={self.created_before}\n"
            f"  drop_duplicates={self.drop_duplicates}\n"
            f")"
        )
output_file_path: Path property

Path to save the downloaded OpenCellID data

OpenCellIDDownloader

Downloader for OpenCellID data

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDDownloader:
    """Downloader for OpenCellID data"""

    def __init__(
        self,
        config: Union[OpenCellIDConfig, dict],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        # Accept either a ready config object or a plain dict of config kwargs.
        if isinstance(config, dict):
            self.config = OpenCellIDConfig(**config)
        else:
            self.config = config

        self.data_store = data_store or LocalDataStore()
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

    @classmethod
    def from_country(
        cls,
        country: str,
        api_token: str = global_config.OPENCELLID_ACCESS_TOKEN,
        **kwargs,
    ):
        """Create a downloader for a specific country"""
        # Convenience constructor: build the config, then delegate to __init__.
        config = OpenCellIDConfig(country=country, api_token=api_token, **kwargs)
        return cls(config=config)

    def get_download_links(self) -> List[str]:
        """Get download links for the country from OpenCellID website"""
        # The API token is passed as a query parameter on the downloads page.
        url = f"{self.config.DOWNLOAD_URL}{self.config.api_token}"
        # The page lists countries by ISO alpha-2; config stores alpha-3.
        country_alpha2 = pycountry.countries.get(
            alpha_3=self.config.country.upper()
        ).alpha_2

        try:
            # Find table with cell tower data links
            self.logger.info(f"Fetching download links for {self.config.country}")
            html_content = requests.get(url).text
            soup = BeautifulSoup(html_content, "lxml")
            table = soup.find("table", {"id": "regions"})

            if not table:
                raise ValueError(
                    "Could not find cell tower data table on OpenCellID website"
                )

            # Parse table headers
            t_headers = []
            for th in table.find_all("th"):
                t_headers.append(th.text.replace("\n", " ").strip())

            # Parse table data
            table_data = []
            for tr in table.tbody.find_all("tr"):
                t_row = {}

                # "Files" columns hold lists of anchors; other cells are text.
                for td, th in zip(tr.find_all("td"), t_headers):
                    if "Files" in th:
                        t_row[th] = []
                        for a in td.find_all("a"):
                            t_row[th].append(a.get("href"))
                    else:
                        t_row[th] = td.text.replace("\n", "").strip()

                table_data.append(t_row)

            cell_dict = pd.DataFrame(table_data)

            # Get links for the country code
            if country_alpha2 not in cell_dict["Country Code"].values:
                raise ValueError(
                    f"Country code {country_alpha2} not found in OpenCellID database"
                )
            else:
                links = cell_dict[cell_dict["Country Code"] == country_alpha2][
                    "Files (grouped by MCC)"
                ].values[0]

            return links

        except Exception as e:
            self.logger.error(f"Error fetching download links: {str(e)}")
            raise

    def download_and_process(self) -> str:
        """Download and process OpenCellID data for the configured country"""

        try:
            links = self.get_download_links()
            self.logger.info(f"Found {len(links)} data files for {self.config.country}")

            dfs = []

            for link in links:
                self.logger.info(f"Downloading data from {link}")
                response = requests.get(link, stream=True)
                response.raise_for_status()

                # Use a temporary file for download
                with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as tmpfile:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            tmpfile.write(chunk)
                    temp_file = tmpfile.name

                try:
                    # Read the downloaded gzipped CSV data
                    with gzip.open(temp_file, "rt") as feed_data:
                        dfs.append(
                            pd.read_csv(
                                feed_data,
                                names=[
                                    "radio",
                                    "mcc",
                                    "net",
                                    "area",
                                    "cell",
                                    "unit",
                                    "lon",
                                    "lat",
                                    "range",
                                    "samples",
                                    "changeable",
                                    "created",
                                    "updated",
                                    "average_signal",
                                ],
                            )
                        )
                except IOError as e:
                    # A non-gzip body usually means the API returned an error
                    # message instead of data; inspect the first line to tell.
                    with open(temp_file, "r") as error_file:
                        contents = error_file.readline()

                    if "RATE_LIMITED" in contents:
                        raise RuntimeError(
                            "API rate limit exceeded. You're rate-limited!"
                        )
                    elif "INVALID_TOKEN" in contents:
                        raise RuntimeError("API token rejected by OpenCellID!")
                    else:
                        raise RuntimeError(
                            f"Error processing downloaded data: {str(e)}"
                        )
                finally:
                    # Clean up temporary file
                    if os.path.exists(temp_file):
                        os.remove(temp_file)

            df_cell = pd.concat(dfs, ignore_index=True)

            # Process the data
            if not df_cell.empty:
                # Convert timestamps to datetime
                df_cell["created"] = pd.to_datetime(
                    df_cell["created"], unit="s", origin="unix"
                )
                df_cell["updated"] = pd.to_datetime(
                    df_cell["updated"], unit="s", origin="unix"
                )

                # Filter by year: inclusive lower bound, exclusive upper bound
                # (towers created during `created_before` itself are dropped).
                df_cell = df_cell[
                    (df_cell.created.dt.year >= self.config.created_newer)
                    & (df_cell.created.dt.year < self.config.created_before)
                ]

                # Drop duplicates if configured
                if self.config.drop_duplicates:
                    # Keep the first record per (radio, lon, lat) triple.
                    df_cell = (
                        df_cell.groupby(["radio", "lon", "lat"]).first().reset_index()
                    )

                # Save processed data using data_store
                output_path = str(self.config.output_file_path)
                self.logger.info(f"Saving processed data to {output_path}")
                with self.data_store.open(output_path, "wb") as f:
                    df_cell.to_csv(f, compression="gzip", index=False)

                return output_path
            else:
                raise ValueError(f"No data found for {self.config.country}")

        except Exception as e:
            self.logger.error(f"Error downloading and processing data: {str(e)}")
            raise
download_and_process()

Download and process OpenCellID data for the configured country

Source code in gigaspatial/handlers/opencellid.py
def download_and_process(self) -> str:
    """Download and process OpenCellID data for the configured country"""

    try:
        links = self.get_download_links()
        self.logger.info(f"Found {len(links)} data files for {self.config.country}")

        dfs = []

        for link in links:
            self.logger.info(f"Downloading data from {link}")
            response = requests.get(link, stream=True)
            response.raise_for_status()

            # Use a temporary file for download
            with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as tmpfile:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        tmpfile.write(chunk)
                temp_file = tmpfile.name

            try:
                # Read the downloaded gzipped CSV data
                with gzip.open(temp_file, "rt") as feed_data:
                    dfs.append(
                        pd.read_csv(
                            feed_data,
                            names=[
                                "radio",
                                "mcc",
                                "net",
                                "area",
                                "cell",
                                "unit",
                                "lon",
                                "lat",
                                "range",
                                "samples",
                                "changeable",
                                "created",
                                "updated",
                                "average_signal",
                            ],
                        )
                    )
            except IOError as e:
                # A non-gzip body usually means the API returned an error
                # message instead of data; inspect the first line to tell.
                with open(temp_file, "r") as error_file:
                    contents = error_file.readline()

                if "RATE_LIMITED" in contents:
                    raise RuntimeError(
                        "API rate limit exceeded. You're rate-limited!"
                    )
                elif "INVALID_TOKEN" in contents:
                    raise RuntimeError("API token rejected by OpenCellID!")
                else:
                    raise RuntimeError(
                        f"Error processing downloaded data: {str(e)}"
                    )
            finally:
                # Clean up temporary file
                if os.path.exists(temp_file):
                    os.remove(temp_file)

        df_cell = pd.concat(dfs, ignore_index=True)

        # Process the data
        if not df_cell.empty:
            # Convert timestamps to datetime
            df_cell["created"] = pd.to_datetime(
                df_cell["created"], unit="s", origin="unix"
            )
            df_cell["updated"] = pd.to_datetime(
                df_cell["updated"], unit="s", origin="unix"
            )

            # Filter by year: inclusive lower bound, exclusive upper bound
            # (towers created during `created_before` itself are dropped).
            df_cell = df_cell[
                (df_cell.created.dt.year >= self.config.created_newer)
                & (df_cell.created.dt.year < self.config.created_before)
            ]

            # Drop duplicates if configured
            if self.config.drop_duplicates:
                # Keep the first record per (radio, lon, lat) triple.
                df_cell = (
                    df_cell.groupby(["radio", "lon", "lat"]).first().reset_index()
                )

            # Save processed data using data_store
            output_path = str(self.config.output_file_path)
            self.logger.info(f"Saving processed data to {output_path}")
            with self.data_store.open(output_path, "wb") as f:
                df_cell.to_csv(f, compression="gzip", index=False)

            return output_path
        else:
            raise ValueError(f"No data found for {self.config.country}")

    except Exception as e:
        self.logger.error(f"Error downloading and processing data: {str(e)}")
        raise
from_country(country, api_token=global_config.OPENCELLID_ACCESS_TOKEN, **kwargs) classmethod

Create a downloader for a specific country

Source code in gigaspatial/handlers/opencellid.py
@classmethod
def from_country(
    cls,
    country: str,
    api_token: str = global_config.OPENCELLID_ACCESS_TOKEN,
    **kwargs,
):
    """Create a downloader for a specific country"""
    # Convenience constructor: build the config, then delegate to __init__.
    config = OpenCellIDConfig(country=country, api_token=api_token, **kwargs)
    return cls(config=config)

get_download_links()

Get download links for the country from the OpenCellID website

Source code in gigaspatial/handlers/opencellid.py
def get_download_links(self) -> List[str]:
    """Get download links for the country from the OpenCellID website.

    Returns:
        List[str]: URLs of the cell tower data files for the configured country.

    Raises:
        ValueError: If the country code is unknown, the data table cannot be
            found, or the country is absent from the OpenCellID table.
        requests.HTTPError: If the OpenCellID page request returns an error status.
    """
    url = f"{self.config.DOWNLOAD_URL}{self.config.api_token}"

    # pycountry.countries.get returns None for unknown alpha-3 codes; fail
    # with a clear message instead of an AttributeError on None.
    country = pycountry.countries.get(alpha_3=self.config.country.upper())
    if country is None:
        raise ValueError(f"Unknown ISO alpha-3 country code: {self.config.country}")
    country_alpha2 = country.alpha_2

    try:
        # Find table with cell tower data links
        self.logger.info(f"Fetching download links for {self.config.country}")
        # Bound the request so a stalled server can't hang the caller, and
        # fail fast on HTTP errors instead of parsing an error page as data.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        table = soup.find("table", {"id": "regions"})

        if not table:
            raise ValueError(
                "Could not find cell tower data table on OpenCellID website"
            )

        # Parse table headers
        t_headers = [
            th.text.replace("\n", " ").strip() for th in table.find_all("th")
        ]

        # Parse table data: one dict per row, keyed by header text
        table_data = []
        for tr in table.tbody.find_all("tr"):
            t_row = {}
            for td, th in zip(tr.find_all("td"), t_headers):
                if "Files" in th:
                    # The files column holds one or more download anchors
                    t_row[th] = [a.get("href") for a in td.find_all("a")]
                else:
                    t_row[th] = td.text.replace("\n", "").strip()
            table_data.append(t_row)

        cell_dict = pd.DataFrame(table_data)

        # Get links for the country code
        if country_alpha2 not in cell_dict["Country Code"].values:
            raise ValueError(
                f"Country code {country_alpha2} not found in OpenCellID database"
            )

        return cell_dict[cell_dict["Country Code"] == country_alpha2][
            "Files (grouped by MCC)"
        ].values[0]

    except Exception as e:
        self.logger.error(f"Error fetching download links: {str(e)}")
        raise

OpenCellIDReader

Reader for OpenCellID data

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDReader:
    """Reader for OpenCellID data"""

    def __init__(
        self,
        country: str,
        data_store: Optional[DataStore] = None,
        base_path: Optional[Path] = None,
    ):
        # Normalize any country name/code to its ISO alpha-3 form
        self.country = pycountry.countries.lookup(country).alpha_3
        self.data_store = data_store if data_store is not None else LocalDataStore()
        if base_path is None:
            base_path = global_config.get_path("opencellid", "bronze")
        self.base_path = base_path

    def read_data(self) -> pd.DataFrame:
        """Load the processed OpenCellID dataset for this reader's country."""
        file_path = str(self.base_path / f"opencellid_{self.country.lower()}.csv.gz")

        if self.data_store.file_exists(file_path):
            return read_dataset(self.data_store, file_path)

        raise FileNotFoundError(
            f"OpenCellID data for {self.country} not found at {file_path}. "
            "Download the data first using OpenCellIDDownloader."
        )

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Return the country's cell towers as points in EPSG:4326."""
        towers = self.read_data()
        geometry = gpd.points_from_xy(towers.lon, towers.lat)
        return gpd.GeoDataFrame(towers, geometry=geometry, crs="EPSG:4326")
read_data()

Read OpenCellID data for the specified country

Source code in gigaspatial/handlers/opencellid.py
def read_data(self) -> pd.DataFrame:
    """Load the processed OpenCellID dataset for this reader's country."""
    file_path = str(self.base_path / f"opencellid_{self.country.lower()}.csv.gz")

    if self.data_store.file_exists(file_path):
        return read_dataset(self.data_store, file_path)

    raise FileNotFoundError(
        f"OpenCellID data for {self.country} not found at {file_path}. "
        "Download the data first using OpenCellIDDownloader."
    )
to_geodataframe()

Convert OpenCellID data to a GeoDataFrame

Source code in gigaspatial/handlers/opencellid.py
def to_geodataframe(self) -> gpd.GeoDataFrame:
    """Return the country's cell towers as a GeoDataFrame of points (EPSG:4326)."""
    towers = self.read_data()
    geometry = gpd.points_from_xy(towers.lon, towers.lat)
    return gpd.GeoDataFrame(towers, geometry=geometry, crs="EPSG:4326")

osm

OSMLocationFetcher

A class to fetch and process location data from OpenStreetMap using the Overpass API.

This class supports fetching various OSM location types including amenities, buildings, shops, and other POI categories.

Source code in gigaspatial/handlers/osm.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
@dataclass
class OSMLocationFetcher:
    """
    A class to fetch and process location data from OpenStreetMap using the Overpass API.

    This class supports fetching various OSM location types including amenities, buildings,
    shops, and other POI categories.
    """

    country: Optional[str] = None
    admin_level: Optional[int] = None
    admin_value: Optional[str] = None
    location_types: Union[List[str], Dict[str, List[str]]] = Field(...)
    base_url: str = "http://overpass-api.de/api/interpreter"
    timeout: int = 600
    max_retries: int = 3
    retry_delay: int = 5

    def __post_init__(self):
        """Validate inputs, normalize location_types, and set up logging.

        Raises:
            TypeError: If location_types is neither a list nor a dict.
            ValueError: If the country code is invalid, or neither a country
                nor an admin_level/admin_value pair is provided.
        """

        # Normalize location_types to always be a dictionary
        if isinstance(self.location_types, list):
            self.location_types = {"amenity": self.location_types}
        elif not isinstance(self.location_types, dict):
            raise TypeError(
                "location_types must be a list of strings or a dictionary mapping categories to type lists"
            )

        self.logger = config.get_logger(self.__class__.__name__)

        # Validate area selection
        if self.admin_level is not None and self.admin_value is not None:
            self.area_query = f'area["admin_level"={self.admin_level}]["name"="{self.admin_value}"]->.searchArea;'
            self.logger.info(
                f"Using admin_level={self.admin_level}, name={self.admin_value} for area selection."
            )
        elif self.country is not None:
            try:
                self.country = pycountry.countries.lookup(self.country).alpha_2
            except LookupError:
                raise ValueError(f"Invalid country code provided: {self.country}")
            # Quote the tag value: Overpass QL expects string values in quotes
            # (consistent with the filter built in get_admin_names).
            self.area_query = f'area["ISO3166-1"="{self.country}"]->.searchArea;'
            self.logger.info(f"Using country={self.country} for area selection.")
        else:
            raise ValueError(
                "Either country or both admin_level and admin_value must be provided."
            )

    @staticmethod
    def get_admin_names(
        admin_level: int, country: Optional[str] = None, timeout: int = 120
    ) -> List[str]:
        """
        Fetch all admin area names for a given admin_level (optionally within a country).

        Args:
            admin_level (int): The OSM admin_level to search for (e.g., 4 for states, 6 for counties).
            country (str, optional): Country name or ISO code to filter within.
            timeout (int): Timeout for the Overpass API request.

        Returns:
            List[str]: Sorted, de-duplicated list of admin area names.

        Raises:
            ValueError: If the given country cannot be resolved to an ISO code.
        """

        # Restrict the search to a country when one is given
        area_filter = ""
        area_ref = ""
        if country:
            try:
                country_code = pycountry.countries.lookup(country).alpha_2
            except LookupError:
                raise ValueError(f"Invalid country code or name: {country}")
            area_filter = f'area["ISO3166-1"="{country_code}"]->.countryArea;'
            area_ref = "(area.countryArea)"

        # Overpass QL to get all admin areas at the specified level
        query = f"""
        [out:json][timeout:{timeout}];
        {area_filter}
        (
          relation["admin_level"="{admin_level}"]{area_ref};
        );
        out tags;
        """

        url = "http://overpass-api.de/api/interpreter"
        response = requests.get(url, params={"data": query}, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Collect every named element; the set drops duplicates
        unique_names = {
            el["tags"]["name"]
            for el in data.get("elements", [])
            if el.get("tags", {}).get("name")
        }
        return sorted(unique_names)

    @staticmethod
    def get_osm_countries(
        iso3_code: Optional[str] = None, include_names: bool = True, timeout: int = 1000
    ) -> Union[str, Dict[str, str], List[str], List[Dict[str, str]]]:
        """
        Fetch countries from OpenStreetMap database.

        This queries the actual OSM database for country boundaries and returns
        country names as they appear in OSM, including various name translations.

        Args:
            iso3_code (str, optional): ISO 3166-1 alpha-3 code to fetch a specific country.
                                      If provided, returns single country data.
                                      If None, returns all countries.
            include_names (bool): If True, return dict with multiple name variants.
                                 If False, return only the primary name.
            timeout (int): Timeout for the Overpass API request (default: 1000).

        Returns:
            When iso3_code is provided:
                - If include_names=False: Single country name (str)
                - If include_names=True: Dict with name variants
            When iso3_code is None:
                - If include_names=False: List of country names
                - If include_names=True: List of dicts with name variants including:
                  name, name:en, ISO3166-1 codes, and other name translations

        Raises:
            ValueError: If iso3_code is provided but country not found in OSM.
        """
        if iso3_code:
            # Filter for the specific ISO3 code provided
            iso3_upper = iso3_code.upper()
            country_filter = f'["ISO3166-1:alpha3"="{iso3_upper}"]'
        else:
            # Filter for the *existence* of an ISO3 code tag to limit results to actual countries
            country_filter = '["ISO3166-1:alpha3"]'

        # Query OSM for country-level boundaries
        # "out tags;" returns only each relation's tags, no geometry.
        query = f"""
        [out:json][timeout:{timeout}];
        (
          relation["boundary"="administrative"]["admin_level"="2"]{country_filter};
        );
        out tags;
        """

        url = "http://overpass-api.de/api/interpreter"
        response = requests.get(url, params={"data": query}, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        countries = []
        for element in data.get("elements", []):
            tags = element.get("tags", {})

            if include_names:
                country_info = {
                    "name": tags.get("name", ""),
                    "name:en": tags.get("name:en", ""),
                    "official_name": tags.get("official_name", ""),
                    "official_name:en": tags.get("official_name:en", ""),
                    "ISO3166-1": tags.get("ISO3166-1", ""),
                    "ISO3166-1:alpha2": tags.get("ISO3166-1:alpha2", ""),
                    "ISO3166-1:alpha3": tags.get("ISO3166-1:alpha3", ""),
                }

                # Add any other name:* tags (translations)
                for key, value in tags.items():
                    if key.startswith("name:") and key not in country_info:
                        country_info[key] = value

                # Remove empty string values
                country_info = {k: v for k, v in country_info.items() if v}

                if country_info.get("name"):  # Only add if has a name
                    countries.append(country_info)
            else:
                name = tags.get("name")
                if name:
                    countries.append(name)

        # If looking for a specific country, return single result or raise error
        if iso3_code:
            if not countries:
                raise ValueError(
                    f"Country with ISO3 code '{iso3_code}' not found in OSM database"
                )
            return countries[0]  # Return single country, not a list

        # Return sorted list for all countries
        # Entries are plain strings or dicts depending on include_names, so
        # the sort key falls back to the dict's "name" field.
        return sorted(
            countries, key=lambda x: x if isinstance(x, str) else x.get("name", "")
        )

    def _make_request(self, query: str) -> Dict:
        """Issue a GET to the Overpass API, retrying transient failures.

        Raises:
            RuntimeError: When every one of max_retries attempts has failed.
        """
        for attempt_no in range(1, self.max_retries + 1):
            try:
                self.logger.debug(f"Executing query:\n{query}")
                response = requests.get(
                    self.base_url, params={"data": query}, timeout=self.timeout
                )
                response.raise_for_status()
                return response.json()
            except RequestException as e:
                self.logger.warning(f"Attempt {attempt_no} failed: {str(e)}")
                if attempt_no == self.max_retries:
                    raise RuntimeError(
                        f"Failed to fetch data after {self.max_retries} attempts"
                    ) from e
                # Back off briefly before the next attempt
                sleep(self.retry_delay)

    def _extract_matching_categories(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract all matching categories and their values from the tags.
        Returns:
            Dict mapping each matching category to its value
        """
        return {
            category: tags[category]
            for category, types in self.location_types.items()
            if category in tags and tags[category] in types
        }

    def _process_node_relation(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a node or relation element.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matches = self._extract_matching_categories(tags)

            if not matches:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            # Nodes carry lat/lon directly; otherwise fall back to "center"
            lat = element.get("lat") or element["center"]["lat"]
            lon = element.get("lon") or element["center"]["lon"]
            geometry = Point(lon, lat)

            # Change-tracking metadata is present only when the query used meta output
            metadata = (
                {
                    "timestamp": element["timestamp"],
                    "version": element.get("version"),
                    "changeset": element.get("changeset"),
                    "user": element.get("user"),
                    "uid": element.get("uid"),
                }
                if "timestamp" in element
                else {}
            )

            # Emit one record per matching category
            return [
                {
                    "source_id": element["id"],
                    "category": category,
                    "category_value": value,
                    "name": tags.get("name", ""),
                    "name_en": tags.get("name:en", ""),
                    "type": element["type"],
                    "geometry": geometry,
                    "latitude": lat,
                    "longitude": lon,
                    "matching_categories": list(matches.keys()),
                    **metadata,
                }
                for category, value in matches.items()
            ]

        except KeyError as e:
            self.logger.error(f"Corrupt data received for node/relation: {str(e)}")
            return []

    def _process_way(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a way element with geometry.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matches = self._extract_matching_categories(tags)

            if not matches:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            # Build the footprint polygon from the way's vertices; its
            # centroid serves as the representative point.
            footprint = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
            centre = footprint.centroid

            # Change-tracking metadata is present only when the query used meta output
            metadata = (
                {
                    "timestamp": element["timestamp"],
                    "version": element.get("version"),
                    "changeset": element.get("changeset"),
                    "user": element.get("user"),
                    "uid": element.get("uid"),
                }
                if "timestamp" in element
                else {}
            )

            # Emit one record per matching category
            return [
                {
                    "source_id": element["id"],
                    "category": category,
                    "category_value": value,
                    "name": tags.get("name", ""),
                    "name_en": tags.get("name:en", ""),
                    "type": element["type"],
                    "geometry": footprint,
                    "latitude": centre.y,
                    "longitude": centre.x,
                    "matching_categories": list(matches.keys()),
                    **metadata,
                }
                for category, value in matches.items()
            ]
        except (KeyError, ValueError) as e:
            self.logger.error(f"Error processing way geometry: {str(e)}")
            return []

    def _build_queries(
        self,
        date_filter_type: Optional[Literal["newer", "changed"]] = None,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        include_metadata: bool = False,
    ) -> List[str]:
        """
        Construct Overpass QL queries with optional date filtering and metadata.

        Args:
            date_filter_type: Type of date filter ('newer' or 'changed')
            start_date: Start date in ISO 8601 format
            end_date: End date in ISO 8601 format (required for 'changed')
            include_metadata: If True, include change metadata (timestamp, version, changeset, user)

        Returns:
            List[str]: List of [nodes_relations_query, ways_query]
        """
        # Build the date filter based on type; silently falls back to no
        # filter when the dates required for the chosen type are missing.
        if date_filter_type == "newer" and start_date:
            date_filter = f'(newer:"{start_date}")'
        elif date_filter_type == "changed" and start_date and end_date:
            date_filter = f'(changed:"{start_date}","{end_date}")'
        else:
            date_filter = ""

        # Determine output mode: "center" yields a representative point for
        # relations, "geom" full way geometry, "meta" adds change metadata.
        output_mode = "center meta" if include_metadata else "center"
        output_mode_geom = "geom meta" if include_metadata else "geom"

        # Query for nodes and relations
        # NOTE(review): the regex "^(a|b)" has no trailing "$", so tag values
        # are prefix-matched (e.g. "school" would also match "school_yard") —
        # confirm this is intended.
        nodes_relations_queries = []
        for category, types in self.location_types.items():
            nodes_relations_queries.extend(
                [
                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                ]
            )

        nodes_relations_queries = "\n".join(nodes_relations_queries)

        nodes_relations_query = f"""
        [out:json][timeout:{self.timeout}];
        {self.area_query}
        (
            {nodes_relations_queries}
        );
        out {output_mode};
        """

        # Query for ways
        ways_queries = []
        for category, types in self.location_types.items():
            ways_queries.append(
                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
            )

        ways_queries = "\n".join(ways_queries)

        ways_query = f"""
        [out:json][timeout:{self.timeout}];
        {self.area_query}
        (
            {ways_queries}
        );
        out {output_mode_geom};
        """

        return [nodes_relations_query, ways_query]

    def fetch_locations(
        self,
        since_date: Optional[Union[str, datetime]] = None,
        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
        include_metadata: bool = False,
    ) -> pd.DataFrame:
        """
        Fetch OSM locations, optionally filtered by 'since' date.

        Use this for incremental updates or getting all current locations.

        Args:
            since_date (str or datetime, optional): Only return locations added or
                modified since this date/time ("YYYY-MM-DDThh:mm:ssZ" or datetime).
            handle_duplicates (str): How to handle objects matching multiple categories:
                - 'separate': Create separate entries for each category (default)
                - 'combine': Use a single entry with a list of matching categories
                - 'primary': Keep only the first matching category
            include_metadata: If True, include change tracking metadata
                (timestamp, version, changeset, user, uid)

        Returns:
            pd.DataFrame: Processed OSM locations

        Raises:
            ValueError: If handle_duplicates is not one of the accepted values.
        """
        if handle_duplicates not in ("separate", "combine", "primary"):
            raise ValueError(
                "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
            )

        self.logger.info(
            f"Fetching OSM locations from Overpass API for country: {self.country}"
        )
        self.logger.info(f"Location types: {self.location_types}")

        # Normalize date if provided
        since_str = self._normalize_date(since_date) if since_date else None

        if since_str:
            self.logger.info(f"Filtering for changes since: {since_str}")

        # Only request "newer" filtering when a date was actually given
        queries = self._build_queries(
            date_filter_type="newer" if since_str else None,
            start_date=since_str,
            include_metadata=include_metadata,
        )

        return self._execute_and_process_queries(queries, handle_duplicates)

    def fetch_locations_changed_between(
        self,
        start_date: Union[str, datetime],
        end_date: Union[str, datetime],
        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
        include_metadata: bool = True,
    ) -> pd.DataFrame:
        """
        Fetch OSM locations whose last change falls inside a date range.

        Suited to historical analysis or auditing edits made during a period.

        Args:
            start_date: Lower bound, ISO 8601 string ("YYYY-MM-DDThh:mm:ssZ")
                or datetime. Changes after this date are included.
            end_date: Upper bound, ISO 8601 string or datetime. Changes before
                this date are included.
            handle_duplicates: Strategy for objects matching several categories:
                - 'separate': one entry per matching category (default)
                - 'combine': single entry listing all matching categories
                - 'primary': keep only the first matching category
            include_metadata: Include change-tracking metadata (timestamp,
                version, changeset, user, uid). Defaults to True because
                change tracking is the primary use case here.

        Returns:
            pd.DataFrame: Locations changed within the requested window.

        Raises:
            ValueError: If either date is malformed or start_date is not
                before end_date.
        """
        start_str = self._normalize_date(start_date)
        end_str = self._normalize_date(end_date)

        if not start_str < end_str:
            raise ValueError(
                f"start_date must be before end_date (got {start_str} >= {end_str})"
            )

        return self._execute_and_process_queries(
            self._build_queries(
                date_filter_type="changed",
                start_date=start_str,
                end_date=end_str,
                include_metadata=include_metadata,
            ),
            handle_duplicates,
        )

    def _normalize_date(self, date_input: Union[str, datetime]) -> str:
        """
        Convert date input to ISO 8601 format string.

        Args:
            date_input: Either a string in ISO 8601 format or a datetime object

        Returns:
            str: Date in format "YYYY-MM-DDThh:mm:ssZ"

        Raises:
            ValueError: If string format is invalid
            TypeError: If date_input is neither str nor datetime
        """
        from datetime import datetime

        if isinstance(date_input, datetime):
            # Render as ISO 8601 with a literal Z (UTC) suffix
            return date_input.strftime("%Y-%m-%dT%H:%M:%SZ")

        if not isinstance(date_input, str):
            raise TypeError(
                f"date_input must be str or datetime, got {type(date_input).__name__}"
            )

        # Round-trip parse to confirm the string is well-formed
        try:
            datetime.strptime(date_input, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            raise ValueError(
                f"Invalid date format: '{date_input}'. "
                "Expected format: 'YYYY-MM-DDThh:mm:ssZ' (e.g., '2024-03-15T14:30:00Z')"
            )
        return date_input

    def _execute_and_process_queries(
        self, queries: List[str], handle_duplicates: str
    ) -> pd.DataFrame:
        """
        Execute queries and process results (extracted from fetch_locations).

        Args:
            queries: List of [nodes_relations_query, ways_query]
            handle_duplicates: Strategy for handling duplicate categories
                ('separate', 'combine' or 'primary')

        Returns:
            pd.DataFrame: Processed locations (empty DataFrame when nothing matched)
        """
        nodes_relations_query, ways_query = queries

        # Fetch nodes and relations
        nodes_relations_response = self._make_request(nodes_relations_query)
        nodes_relations = nodes_relations_response.get("elements", [])

        # Fetch ways
        ways_response = self._make_request(ways_query)
        ways = ways_response.get("elements", [])

        if not nodes_relations and not ways:
            self.logger.warning("No locations found for the specified criteria")
            return pd.DataFrame()

        self.logger.info(
            f"Processing {len(nodes_relations)} nodes/relations and {len(ways)} ways..."
        )

        # Process nodes and relations
        # Each _process_* call may return several records (one per matching
        # category); the nested comprehension flattens them.
        with ThreadPoolExecutor() as executor:
            processed_nodes_relations = [
                item
                for sublist in executor.map(
                    self._process_node_relation, nodes_relations
                )
                for item in sublist
            ]

        # Process ways
        with ThreadPoolExecutor() as executor:
            processed_ways = [
                item
                for sublist in executor.map(self._process_way, ways)
                for item in sublist
            ]

        # Combine all processed elements
        all_elements = processed_nodes_relations + processed_ways

        if not all_elements:
            self.logger.warning("No matching elements found after processing")
            return pd.DataFrame()

        # Handle duplicates (reuse existing logic from fetch_locations)
        # 'primary': the first record per source_id wins; later ones are dropped.
        # 'combine': category/category_value are promoted to lists holding every
        # distinct matching category for that source_id.
        if handle_duplicates != "separate":
            grouped_elements = {}
            for elem in all_elements:
                source_id = elem["source_id"]
                if source_id not in grouped_elements:
                    grouped_elements[source_id] = elem
                elif handle_duplicates == "combine":
                    if grouped_elements[source_id]["category"] != elem["category"]:
                        # Promote scalar category fields to lists on first merge
                        if isinstance(grouped_elements[source_id]["category"], str):
                            grouped_elements[source_id]["category"] = [
                                grouped_elements[source_id]["category"]
                            ]
                            grouped_elements[source_id]["category_value"] = [
                                grouped_elements[source_id]["category_value"]
                            ]

                        if (
                            elem["category"]
                            not in grouped_elements[source_id]["category"]
                        ):
                            grouped_elements[source_id]["category"].append(
                                elem["category"]
                            )
                            grouped_elements[source_id]["category_value"].append(
                                elem["category_value"]
                            )

            all_elements = list(grouped_elements.values())

        locations = pd.DataFrame(all_elements)

        # Log statistics
        type_counts = locations["type"].value_counts()
        self.logger.info("\nElement type distribution:")
        for element_type, count in type_counts.items():
            self.logger.info(f"{element_type}: {count}")

        self.logger.info(f"Successfully processed {len(locations)} locations")
        return locations
__post_init__()

Validate inputs, normalize location_types, and set up logging.

Source code in gigaspatial/handlers/osm.py
def __post_init__(self):
    """Validate inputs, normalize location_types, and set up logging.

    Raises:
        TypeError: If location_types is neither a list nor a dict.
        ValueError: If the country code is invalid, or neither a country
            nor an admin_level/admin_value pair was provided.
    """

    # Normalize location_types so downstream code can always assume a
    # {category: [types, ...]} mapping.
    if isinstance(self.location_types, list):
        self.location_types = {"amenity": self.location_types}
    elif not isinstance(self.location_types, dict):
        raise TypeError(
            "location_types must be a list of strings or a dictionary mapping categories to type lists"
        )

    self.logger = config.get_logger(self.__class__.__name__)

    # Area selection: an explicit admin area takes precedence over country.
    if self.admin_level is not None and self.admin_value is not None:
        # Quote filter values, consistent with get_admin_names' Overpass QL.
        self.area_query = f'area["admin_level"="{self.admin_level}"]["name"="{self.admin_value}"]->.searchArea;'
        self.logger.info(
            f"Using admin_level={self.admin_level}, name={self.admin_value} for area selection."
        )
    elif self.country is not None:
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")
        # Quote the ISO code as well; unquoted values are only valid for
        # plain alphanumeric tokens in Overpass QL.
        self.area_query = f'area["ISO3166-1"="{self.country}"]->.searchArea;'
        self.logger.info(f"Using country={self.country} for area selection.")
    else:
        raise ValueError(
            "Either country or both admin_level and admin_value must be provided."
        )
fetch_locations(since_date=None, handle_duplicates='separate', include_metadata=False)

Fetch OSM locations, optionally filtered by 'since' date.

Use this for incremental updates or getting all current locations.

Parameters:

Name Type Description Default
since_date Optional[Union[str, datetime]]

Filter for locations added/modified since this date/time.

None
handle_duplicates str

How to handle objects matching multiple categories: - 'separate': Create separate entries for each category (default) - 'combine': Use a single entry with a list of matching categories - 'primary': Keep only the first matching category

'separate'
include_metadata bool

If True, include change tracking metadata (timestamp, version, changeset, user, uid)

False

Returns:

Type Description
DataFrame

pd.DataFrame: Processed OSM locations

Source code in gigaspatial/handlers/osm.py
def fetch_locations(
    self,
    since_date: Optional[Union[str, datetime]] = None,
    handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
    include_metadata: bool = False,
) -> pd.DataFrame:
    """
    Fetch OSM locations, optionally filtered by 'since' date.

    Use this for incremental updates or getting all current locations.

    Args:
        since_date (str or datetime, optional): Filter for locations
            added/modified since this date/time. Accepts an ISO 8601
            string or a datetime object; None disables the filter.
        handle_duplicates (str): How to handle objects matching multiple categories:
            - 'separate': Create separate entries for each category (default)
            - 'combine': Use a single entry with a list of matching categories
            - 'primary': Keep only the first matching category
        include_metadata: If True, include change tracking metadata
            (timestamp, version, changeset, user, uid)

    Returns:
        pd.DataFrame: Processed OSM locations

    Raises:
        ValueError: If handle_duplicates is not one of the accepted modes.
    """
    if handle_duplicates not in ("separate", "combine", "primary"):
        raise ValueError(
            "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
        )

    self.logger.info(
        f"Fetching OSM locations from Overpass API for country: {self.country}"
    )
    self.logger.info(f"Location types: {self.location_types}")

    # Normalize date if provided
    since_str = self._normalize_date(since_date) if since_date else None

    if since_str:
        self.logger.info(f"Filtering for changes since: {since_str}")

    # "newer" restricts the Overpass query to elements modified after since_str.
    queries = self._build_queries(
        date_filter_type="newer" if since_str else None,
        start_date=since_str,
        include_metadata=include_metadata,
    )

    return self._execute_and_process_queries(queries, handle_duplicates)
fetch_locations_changed_between(start_date, end_date, handle_duplicates='separate', include_metadata=True)

Fetch OSM locations that changed within a specific date range.

Use this for historical analysis or tracking changes in a specific period.

Parameters:

Name Type Description Default
start_date Union[str, datetime]

Start date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ") or datetime object. Changes after this date will be included.

required
end_date Union[str, datetime]

End date/time in ISO 8601 format (str: "YYYY-MM-DDThh:mm:ssZ") or datetime object. Changes before this date will be included.

required
handle_duplicates Literal['separate', 'combine', 'primary']

How to handle objects matching multiple categories: - 'separate': Create separate entries for each category (default) - 'combine': Use a single entry with a list of matching categories - 'primary': Keep only the first matching category

'separate'
include_metadata bool

If True, include change tracking metadata (timestamp, version, changeset, user, uid) Defaults to True since change tracking is the main use case.

True

Returns:

Type Description
DataFrame

pd.DataFrame: Processed OSM locations that changed within the date range

Raises:

Type Description
ValueError

If dates are invalid or start_date is after end_date

Source code in gigaspatial/handlers/osm.py
def fetch_locations_changed_between(
    self,
    start_date: Union[str, datetime],
    end_date: Union[str, datetime],
    handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
    include_metadata: bool = True,
) -> pd.DataFrame:
    """
    Fetch OSM locations whose last change falls within a date range.

    Intended for historical analysis or tracking edits made during a
    specific period.

    Args:
        start_date: Range start, as an ISO 8601 string
            ("YYYY-MM-DDThh:mm:ssZ") or a datetime. Changes after this
            instant are included.
        end_date: Range end, in the same formats. Changes before this
            instant are included.
        handle_duplicates: Strategy for objects matching several
            categories: 'separate' (one entry per category, default),
            'combine' (one entry listing all categories), or 'primary'
            (keep only the first match).
        include_metadata: When True (default), include change-tracking
            metadata (timestamp, version, changeset, user, uid).

    Returns:
        pd.DataFrame: Processed OSM locations changed within the range.

    Raises:
        ValueError: If the dates are invalid or start_date is not
            strictly before end_date.
    """
    start_str = self._normalize_date(start_date)
    end_str = self._normalize_date(end_date)

    # Normalized ISO strings compare chronologically, so a plain string
    # comparison is enough to validate the ordering.
    if not start_str < end_str:
        raise ValueError(
            f"start_date must be before end_date (got {start_str} >= {end_str})"
        )

    return self._execute_and_process_queries(
        self._build_queries(
            date_filter_type="changed",
            start_date=start_str,
            end_date=end_str,
            include_metadata=include_metadata,
        ),
        handle_duplicates,
    )
get_admin_names(admin_level, country=None, timeout=120) staticmethod

Fetch all admin area names for a given admin_level (optionally within a country).

Parameters:

Name Type Description Default
admin_level int

The OSM admin_level to search for (e.g., 4 for states, 6 for counties).

required
country str

Country name or ISO code to filter within.

None
timeout int

Timeout for the Overpass API request.

120

Returns:

Type Description
List[str]

List[str]: List of admin area names.

Source code in gigaspatial/handlers/osm.py
@staticmethod
def get_admin_names(
    admin_level: int, country: Optional[str] = None, timeout: int = 120
) -> List[str]:
    """
    Fetch all admin area names at a given admin_level, optionally
    restricted to a single country.

    Args:
        admin_level (int): OSM admin_level to query (e.g. 4 for states,
            6 for counties).
        country (str, optional): Country name or ISO code used to limit
            the search area.
        timeout (int): Overpass API request timeout in seconds.

    Returns:
        List[str]: Sorted, de-duplicated admin area names.

    Raises:
        ValueError: If the country cannot be resolved by pycountry.
    """

    # Restrict the query to a country area when one was requested.
    if country:
        try:
            country_code = pycountry.countries.lookup(country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code or name: {country}")
        area_filter = f'area["ISO3166-1"="{country_code}"]->.countryArea;'
        area_ref = "(area.countryArea)"
    else:
        area_filter = ""
        area_ref = ""

    # Overpass QL to get all admin areas at the specified level
    query = f"""
    [out:json][timeout:{timeout}];
    {area_filter}
    (
      relation["admin_level"="{admin_level}"]{area_ref};
    );
    out tags;
    """

    url = "http://overpass-api.de/api/interpreter"
    response = requests.get(url, params={"data": query}, timeout=timeout)
    response.raise_for_status()
    payload = response.json()

    # Collect the "name" tag of every returned relation, dropping
    # unnamed ones, then sort and de-duplicate in one pass.
    found = [
        element.get("tags", {}).get("name")
        for element in payload.get("elements", [])
    ]
    return sorted({name for name in found if name})
get_osm_countries(iso3_code=None, include_names=True, timeout=1000) staticmethod

Fetch countries from OpenStreetMap database.

This queries the actual OSM database for country boundaries and returns country names as they appear in OSM, including various name translations.

Parameters:

Name Type Description Default
iso3_code str

ISO 3166-1 alpha-3 code to fetch a specific country. If provided, returns single country data. If None, returns all countries.

None
include_names bool

If True, return dict with multiple name variants. If False, return only the primary name.

True
timeout int

Timeout for the Overpass API request (default: 1000).

1000

Returns:

Type Description
Union[str, Dict[str, str], List[str], List[Dict[str, str]]]

When iso3_code is provided: - If include_names=False: Single country name (str) - If include_names=True: Dict with name variants

Union[str, Dict[str, str], List[str], List[Dict[str, str]]]

When iso3_code is None: - If include_names=False: List of country names - If include_names=True: List of dicts with name variants including: name, name:en, ISO3166-1 codes, and other name translations

Raises:

Type Description
ValueError

If iso3_code is provided but country not found in OSM.

Source code in gigaspatial/handlers/osm.py
@staticmethod
def get_osm_countries(
    iso3_code: Optional[str] = None, include_names: bool = True, timeout: int = 1000
) -> Union[str, Dict[str, str], List[str], List[Dict[str, str]]]:
    """
    Fetch country records from the OpenStreetMap database.

    Queries Overpass for admin_level=2 administrative boundaries and
    returns country names as stored in OSM, including available name
    translations.

    Args:
        iso3_code (str, optional): ISO 3166-1 alpha-3 code; when given,
            only that country is returned. When None, all countries are.
        include_names (bool): True returns dict(s) of name variants,
            False returns plain name string(s).
        timeout (int): Overpass API request timeout (default 1000).

    Returns:
        With iso3_code: a single name (str) or a single dict of name
        variants. Without iso3_code: a sorted list of names, or of
        name-variant dicts (name, name:en, ISO3166-1 codes, plus any
        other name:* translation tags).

    Raises:
        ValueError: If iso3_code is given but not found in OSM.
    """
    # Match one specific country, or any relation carrying an ISO3 tag
    # (the tag's existence filters out non-country boundaries).
    country_filter = (
        f'["ISO3166-1:alpha3"="{iso3_code.upper()}"]'
        if iso3_code
        else '["ISO3166-1:alpha3"]'
    )

    # Query OSM for country-level boundaries
    query = f"""
    [out:json][timeout:{timeout}];
    (
      relation["boundary"="administrative"]["admin_level"="2"]{country_filter};
    );
    out tags;
    """

    url = "http://overpass-api.de/api/interpreter"
    response = requests.get(url, params={"data": query}, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    countries = []
    for element in data.get("elements", []):
        tags = element.get("tags", {})

        if not include_names:
            name = tags.get("name")
            if name:
                countries.append(name)
            continue

        # Start from the canonical fields, then fold in any extra
        # name:* translation tags not already present.
        info = {
            key: tags.get(key, "")
            for key in (
                "name",
                "name:en",
                "official_name",
                "official_name:en",
                "ISO3166-1",
                "ISO3166-1:alpha2",
                "ISO3166-1:alpha3",
            )
        }
        for key, value in tags.items():
            if key.startswith("name:") and key not in info:
                info[key] = value

        # Drop empty values; skip relations that carry no primary name.
        info = {k: v for k, v in info.items() if v}
        if info.get("name"):
            countries.append(info)

    # A specific lookup returns a single result or raises.
    if iso3_code:
        if not countries:
            raise ValueError(
                f"Country with ISO3 code '{iso3_code}' not found in OSM database"
            )
        return countries[0]

    # Return sorted list for all countries
    return sorted(
        countries, key=lambda x: x if isinstance(x, str) else x.get("name", "")
    )

overture

OvertureAmenityFetcher

A class to fetch and process amenity locations from Overture.

Source code in gigaspatial/handlers/overture.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class OvertureAmenityFetcher:
    """
    Fetch and process amenity locations from the Overture places theme.

    Queries the public Overture parquet release on S3 through DuckDB and
    clips the results to a country (or user-supplied) boundary geometry.
    """

    # constants
    release: Optional[str] = "2024-12-18.0"
    base_url: Optional[str] = (
        "s3://overturemaps-us-west-2/release/{release}/theme=places/*/*"
    )

    # user config
    country: str = Field(...)
    amenity_types: List[str] = Field(..., description="List of amenity types to fetch")
    geom: Union[Polygon, MultiPolygon] = None

    # config for country boundary access from data storage
    # if None GADM boundaries will be used
    data_store: DataStore = None
    country_geom_path: Optional[Union[str, Path]] = None

    def __post_init__(self):
        """Validate inputs, resolve the release URL, and set up logging."""
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        self.base_url = self.base_url.format(release=self.release)
        self.logger = config.get_logger(self.__class__.__name__)

        self.connection = self._set_connection()

    def _set_connection(self):
        """Return a DuckDB connection with the spatial extension loaded."""
        db = duckdb.connect()
        db.install_extension("spatial")
        db.load_extension("spatial")
        return db

    def _load_country_geometry(
        self,
    ) -> Union[Polygon, MultiPolygon]:
        """Load country boundary geometry from DataStore or GADM."""

        gdf_admin0 = AdminBoundaries.create(
            country_code=pycountry.countries.lookup(self.country).alpha_3,
            admin_level=0,
            data_store=self.data_store,
            path=self.country_geom_path,
        ).to_geodataframe()

        return gdf_admin0.geometry.iloc[0]

    def _build_query(self, match_pattern: bool = False, **kwargs) -> str:
        """Construct the DuckDB SQL query for the configured amenities.

        Args:
            match_pattern: If True, match amenity types as substrings
                (ILIKE '%type%'); otherwise require exact category equality.

        Returns:
            str: The fully formatted SQL query.
        """
        # NOTE(review): amenity values are interpolated directly into SQL;
        # they are assumed to be trusted configuration, not user input.
        if match_pattern:
            amenity_query = " OR ".join(
                [f"category ilike '%{amenity}%'" for amenity in self.amenity_types]
            )
        else:
            amenity_query = " OR ".join(
                [f"category == '{amenity}'" for amenity in self.amenity_types]
            )

        # Fix: read from the configured base_url (which embeds `release`)
        # instead of a hard-coded S3 path, so the release setting is honored.
        query = f"""
        SELECT id,
            names.primary AS name,
            ROUND(confidence,2) as confidence,
            categories.primary AS category,
            ST_AsText(geometry) as geometry
        FROM read_parquet('{self.base_url}', hive_partitioning=1)
        WHERE bbox.xmin > {{}}
            AND bbox.ymin > {{}}
            AND bbox.xmax < {{}}
            AND bbox.ymax < {{}}
            AND ({{}})
        """

        if not self.geom:
            self.geom = self._load_country_geometry()

        # bounds unpacks as (minx, miny, maxx, maxy), matching the WHERE order.
        return query.format(*self.geom.bounds, amenity_query)

    def fetch_locations(
        self, match_pattern: bool = False, **kwargs
    ) -> gpd.GeoDataFrame:
        """Fetch amenity locations and clip them to the boundary geometry."""
        self.logger.info("Fetching amenity locations from Overture DB...")

        query = self._build_query(match_pattern=match_pattern, **kwargs)

        df = self.connection.execute(query).df()

        self.logger.info("Processing geometries")
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="EPSG:4326"
        )

        # The bounding-box SQL filter is coarse; refine with an exact
        # intersection test against the boundary geometry.
        s = STRtree(gdf.geometry)
        result = s.query(self.geom, predicate="intersects")

        locations = gdf.iloc[result].reset_index(drop=True)

        self.logger.info(f"Successfully processed {len(locations)} amenity locations")
        return locations
__post_init__()

Validate inputs and set up logging.

Source code in gigaspatial/handlers/overture.py
def __post_init__(self):
    """Normalize the country code, expand the release URL, and connect."""
    try:
        resolved = pycountry.countries.lookup(self.country)
    except LookupError:
        raise ValueError(f"Invalid country code provided: {self.country}")
    self.country = resolved.alpha_2

    self.base_url = self.base_url.format(release=self.release)
    self.logger = config.get_logger(self.__class__.__name__)
    self.connection = self._set_connection()
fetch_locations(match_pattern=False, **kwargs)

Fetch and process amenity locations.

Source code in gigaspatial/handlers/overture.py
def fetch_locations(
    self, match_pattern: bool = False, **kwargs
) -> gpd.GeoDataFrame:
    """Fetch amenity locations and clip them to the boundary geometry."""
    self.logger.info("Fetching amenity locations from Overture DB...")

    sql = self._build_query(match_pattern=match_pattern, **kwargs)
    frame = self.connection.execute(sql).df()

    self.logger.info("Processing geometries")
    gdf = gpd.GeoDataFrame(
        frame,
        geometry=gpd.GeoSeries.from_wkt(frame["geometry"]),
        crs="EPSG:4326",
    )

    # Keep only features that truly intersect the boundary geometry.
    tree = STRtree(gdf.geometry)
    hits = tree.query(self.geom, predicate="intersects")
    locations = gdf.iloc[hits].reset_index(drop=True)

    self.logger.info(f"Successfully processed {len(locations)} amenity locations")
    return locations

rwi

RWIConfig dataclass

Bases: HDXConfig

Configuration for Relative Wealth Index data access

Source code in gigaspatial/handlers/rwi.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class RWIConfig(HDXConfig):
    """Configuration for Relative Wealth Index data access."""

    # The HDX dataset identifier is fixed for RWI.
    dataset_name: Literal["relative-wealth-index"] = Field(
        default="relative-wealth-index"
    )

    # Additional RWI-specific configurations
    country: Optional[str] = Field(
        default=None, description="Country ISO code to filter data for"
    )
    latest_only: bool = Field(
        default=True,
        description="If True, only get the latest resource for each country",
    )

    def __post_init__(self):
        super().__post_init__()

    def get_relevant_data_units(
        self, source: str, force_recompute: bool = False, **kwargs
    ):
        """Return relevant HDX resources, optionally narrowed to the newest one."""
        cache_key = self._cache_key(source, **kwargs)
        resources = super().get_relevant_data_units(source, force_recompute, **kwargs)

        if not (self.latest_only and len(resources) > 1):
            return resources

        # Pick the resource with the most recent "created" timestamp,
        # warning about (and skipping) any timestamps that fail to parse.
        newest = None
        newest_dt = None
        for resource in resources:
            created = resource.get("created")
            if not created:
                continue
            try:
                created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
            except ValueError:
                self.logger.warning(
                    f"Could not parse creation date for resource: {created}"
                )
                continue
            if newest_dt is None or created_dt > newest_dt:
                newest_dt = created_dt
                newest = resource

        if newest:
            resources = [newest]

        # Update the cache to the latest only
        self._unit_cache[cache_key] = resources
        return resources

RWIDownloader

Bases: HDXDownloader

Specialized downloader for the Relative Wealth Index dataset from HDX

Source code in gigaspatial/handlers/rwi.py
class RWIDownloader(HDXDownloader):
    """Specialized downloader for the Relative Wealth Index dataset from HDX"""

    def __init__(
        self,
        config: Optional[Union[RWIConfig, dict]] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: A ready-made RWIConfig, a dict of RWIConfig options,
                or None for a default configuration.
            data_store: Optional data store override.
            logger: Optional logger override.
        """
        # Fix: config defaulted to None, but RWIConfig(**None) raised a
        # TypeError; fall back to an empty option dict instead.
        if not isinstance(config, RWIConfig):
            config = RWIConfig(**(config or {}))
        super().__init__(config=config, data_store=data_store, logger=logger)

RWIHandler

Bases: HDXHandler

Handler for Relative Wealth Index dataset

Source code in gigaspatial/handlers/rwi.py
class RWIHandler(HDXHandler):
    """Handler for the Relative Wealth Index dataset."""

    def __init__(
        self,
        config: Optional[RWIConfig] = None,
        downloader: Optional[RWIDownloader] = None,
        reader: Optional[RWIReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """Wire the RWI-specific components into the generic HDX handler."""
        super().__init__(
            dataset_name="relative-wealth-index",
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> RWIConfig:
        """Build and return the RWI configuration object."""
        return RWIConfig(data_store=data_store, logger=logger, **kwargs)

    def create_downloader(
        self,
        config: RWIConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> RWIDownloader:
        """Build and return the RWI downloader."""
        return RWIDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: RWIConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> RWIReader:
        """Build and return the RWI reader."""
        return RWIReader(config=config, data_store=data_store, logger=logger, **kwargs)
create_config(data_store, logger, **kwargs)

Create and return a RWIConfig instance

Source code in gigaspatial/handlers/rwi.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> RWIConfig:
    """Build and return the RWI configuration object."""
    options = dict(data_store=data_store, logger=logger, **kwargs)
    return RWIConfig(**options)
create_downloader(config, data_store, logger, **kwargs)

Create and return a RWIDownloader instance

Source code in gigaspatial/handlers/rwi.py
def create_downloader(
    self,
    config: RWIConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> RWIDownloader:
    """Build and return the RWI downloader."""
    options = dict(config=config, data_store=data_store, logger=logger, **kwargs)
    return RWIDownloader(**options)
create_reader(config, data_store, logger, **kwargs)

Create and return a RWIReader instance

Source code in gigaspatial/handlers/rwi.py
def create_reader(
    self,
    config: RWIConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> RWIReader:
    """Build and return the RWI reader."""
    options = dict(config=config, data_store=data_store, logger=logger, **kwargs)
    return RWIReader(**options)

RWIReader

Bases: HDXReader

Specialized reader for the Relative Wealth Index dataset from HDX

Source code in gigaspatial/handlers/rwi.py
class RWIReader(HDXReader):
    """Specialized reader for the Relative Wealth Index dataset from HDX"""

    def __init__(
        self,
        config: Optional[Union[RWIConfig, dict]] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: A ready-made RWIConfig, a dict of RWIConfig options,
                or None for a default configuration.
            data_store: Optional data store override.
            logger: Optional logger override.
        """
        # Fix: config defaulted to None, but RWIConfig(**None) raised a
        # TypeError; fall back to an empty option dict instead.
        if not isinstance(config, RWIConfig):
            config = RWIConfig(**(config or {}))
        super().__init__(config=config, data_store=data_store, logger=logger)

srtm

nasa_srtm

NasaSRTMConfig dataclass

Bases: BaseHandlerConfig

Configuration for NASA SRTM .hgt tiles (30m or 90m). Creates tile geometries dynamically for 1°x1° grid cells.

Each tile file covers 1 degree latitude x 1 degree longitude.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class NasaSRTMConfig(BaseHandlerConfig):
    """
    Configuration for NASA SRTM .hgt tiles (30m or 90m).
    Creates tile geometries dynamically for 1°x1° grid cells.

    Each tile file covers 1 degree latitude x 1 degree longitude.
    """

    earthdata_username: str = Field(
        default=global_config.EARTHDATA_USERNAME, description="Earthdata Login username"
    )
    earthdata_password: str = Field(
        default=global_config.EARTHDATA_PASSWORD, description="Earthdata Login password"
    )

    # {} is filled with the resolution in arc-seconds (1 or 3); note the
    # URL already ends with a trailing slash.
    BASE_URL: str = "https://e4ftl01.cr.usgs.gov/MEASURES/SRTMGL{}.003/2000.02.11/"

    # user config
    base_path: Path = global_config.get_path("nasa_srtm", "bronze")
    resolution: Literal["30m", "90m"] = "30m"

    def __post_init__(self):
        super().__post_init__()
        # 30m tiles are 1 arc-second (SRTMGL1); 90m tiles are 3 arc-seconds (SRTMGL3).
        self._res_arc = 3 if self.resolution == "90m" else 1
        self.BASE_URL = self.BASE_URL.format(self._res_arc)
        self.session = self._create_authenticated_session()
        self._generate_tile_grid()

    def _create_authenticated_session(self) -> requests.Session:
        """
        Create a persistent Earthdata-authenticated requests session
        that keeps Authorization headers through redirects.
        """
        logging.info("Setting up Earthdata session with header redirection...")

        session = EarthdataSession(
            username=self.earthdata_username,
            password=self.earthdata_password,
        )

        # Optionally verify credentials once (to pre-authenticate cookies)
        auth_test = "https://urs.earthdata.nasa.gov"
        try:
            r = session.get(auth_test, timeout=10)
            logging.debug(f"Earthdata auth test status: {r.status_code}")
        except requests.RequestException as e:
            # Best-effort check only; later downloads may still succeed.
            logging.warning(f"Earthdata auth test failed: {e}")

        return session

    def _generate_tile_grid(self):
        """
        Generate 1°x1° grid polygons covering the global extent and build
        an STRtree index over them for fast spatial queries.
        """

        lats = range(-90, 90)
        lons = range(-180, 180)

        grid_records = []
        for lat, lon in itertools.product(lats, lons):
            tile_name = self._tile_name(lat, lon)
            grid_records.append(
                {
                    "tile_id": tile_name,
                    "geometry": box(lon, lat, lon + 1, lat + 1),
                    # Fix: BASE_URL already ends with "/"; the previous
                    # f"{BASE_URL}/{tile}" produced a double slash.
                    "tile_url": f"{self.BASE_URL}{tile_name}.SRTMGL{self._res_arc}.hgt.zip",
                }
            )

        self.grid_records = grid_records
        # Every record has a "geometry" key by construction, so index
        # directly rather than via .get().
        self.tile_tree = STRtree([r["geometry"] for r in grid_records])

    def _tile_name(self, lat: int, lon: int) -> str:
        """Return the SRTM tile name like N37E023 or S10W120."""
        ns = "N" if lat >= 0 else "S"
        ew = "E" if lon >= 0 else "W"
        return f"{ns}{abs(lat):02d}{ew}{abs(lon):03d}"

    def get_relevant_data_units(self, source, force_recompute: bool = False, **kwargs):
        """Resolve relevant tiles for `source`, always in EPSG:4326."""
        return super().get_relevant_data_units(
            source, force_recompute, crs="EPSG:4326", **kwargs
        )

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """Return grid-tile records whose 1°x1° cells intersect `geometry`."""
        mask = self.tile_tree.query(geometry, predicate="intersects")
        filtered_grid = [self.grid_records[i] for i in mask]

        return gpd.GeoDataFrame(filtered_grid, crs="EPSG:4326").to_dict("records")

    def get_data_unit_path(self, unit: Union[pd.Series, dict, str], **kwargs) -> Path:
        """
        Given a tile unit or tile_id, return expected storage path.
        """
        tile_id = unit["tile_id"] if isinstance(unit, (pd.Series, dict)) else unit
        return self.base_path / f"{tile_id}.SRTMGL{self._res_arc}.hgt.zip"

    def get_data_unit_paths(
        self, units: Union[pd.DataFrame, Iterable[Union[dict, str]]], **kwargs
    ) -> list:
        """
        Given tile identifiers, return list of file paths.
        """
        if isinstance(units, pd.DataFrame):
            return [
                self.get_data_unit_path(row, **kwargs) for _, row in units.iterrows()
            ]
        return super().get_data_unit_paths(units, **kwargs)
get_data_unit_path(unit, **kwargs)

Given a tile unit or tile_id, return expected storage path.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def get_data_unit_path(self, unit: Union[pd.Series, dict, str], **kwargs) -> Path:
    """
    Given a tile unit or tile_id, return expected storage path.
    """
    tile_id = unit["tile_id"] if isinstance(unit, (pd.Series, dict)) else unit
    return self.base_path / f"{tile_id}.SRTMGL{self._res_arc}.hgt.zip"
get_data_unit_paths(units, **kwargs)

Given tile identifiers, return list of file paths.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def get_data_unit_paths(
    self, units: Union[pd.DataFrame, Iterable[Union[dict, str]]], **kwargs
) -> list:
    """
    Map tile identifiers (a DataFrame of tile rows, or an iterable of
    dicts/ids) to their expected storage paths.
    """
    if not isinstance(units, pd.DataFrame):
        # Non-DataFrame inputs are handled by the generic base implementation.
        return super().get_data_unit_paths(units, **kwargs)
    return [self.get_data_unit_path(row, **kwargs) for _, row in units.iterrows()]
NasaSRTMDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of NASA SRTM elevation data.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
class NasaSRTMDownloader(BaseHandlerDownloader):
    """A class to handle downloads of NASA SRTM elevation data."""

    def __init__(
        self,
        config: Optional[NasaSRTMConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Optional configuration for customizing download behavior and file paths.
                    If None, a default `NasaSRTMConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data storage.
                        If provided, it overrides the `data_store` in the `config`.
                        If None, the `data_store` from the `config` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        # Fall back to a default config so the downloader is usable out of the box.
        config = config or NasaSRTMConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_info: Union[pd.Series, dict],
        **kwargs,
    ) -> Optional[str]:
        """Download data file for a single SRTM tile.

        Args:
            tile_info: Tile record providing ``tile_url`` and ``tile_id``.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            The stored file path on success, or ``None`` if the download failed.
        """

        tile_url = tile_info["tile_url"]

        try:
            response = self.config.session.get(tile_url, stream=True)
            response.raise_for_status()

            file_path = str(self.config.get_data_unit_path(tile_info))

            # Stream the response in 8 KiB chunks so the whole archive is
            # never held in memory at once.
            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            # Log/return after the `with` block so the file handle is closed
            # before the path is reported as available.
            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['tile_id']}"
            )
            return file_path

        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
            )
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None

    def download_data_units(
        self,
        tiles: Union[pd.DataFrame, List[dict]],
        **kwargs,
    ) -> List[str]:
        """Download data files for multiple SRTM tiles in parallel.

        Args:
            tiles: DataFrame of tile records or list of tile dicts; each
                record must be accepted by ``download_data_unit``.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            Paths of successfully downloaded files (failed tiles are omitted).
        """

        if len(tiles) == 0:
            self.logger.warning("There is no matching data")
            return []

        # Normalize to a list of row-like records for the worker pool.
        units = (
            [row for _, row in tiles.iterrows()]
            if isinstance(tiles, pd.DataFrame)
            else tiles
        )

        # No arguments are pre-bound, so the bound method is passed directly
        # (the previous functools.partial wrapper was a no-op).
        with multiprocessing.Pool(self.config.n_workers) as pool:
            file_paths = list(
                tqdm(
                    pool.imap(self.download_data_unit, units),
                    total=len(tiles),
                    desc="Downloading SRTM elevation data",
                )
            )

        return [path for path in file_paths if path is not None]
__init__(config=None, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Optional[NasaSRTMConfig]

Optional configuration for customizing download behavior and file paths. If None, a default NasaSRTMConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If provided, it overrides the data_store in the config. If None, the data_store from the config is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def __init__(
    self,
    config: Optional[NasaSRTMConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Optional configuration for customizing download behavior and file paths.
                If None, a default `NasaSRTMConfig` is used.
        data_store: Optional instance of a `DataStore` for managing data storage.
                    If provided, it overrides the `data_store` in the `config`.
                    If None, the `data_store` from the `config` is used.
        logger: Optional custom logger instance. If None, a default logger
                named after the module is created and used.
    """
    # Fall back to a default config so the downloader is usable out of the box.
    config = config or NasaSRTMConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
download_data_unit(tile_info, **kwargs)

Download data file for a single SRTM tile.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def download_data_unit(
    self,
    tile_info: Union[pd.Series, dict],
    **kwargs,
) -> Optional[str]:
    """Download data file for a single SRTM tile.

    Args:
        tile_info: Tile record providing ``tile_url`` and ``tile_id``.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        The stored file path on success, or ``None`` if the download failed.
    """

    tile_url = tile_info["tile_url"]

    try:
        response = self.config.session.get(tile_url, stream=True)
        response.raise_for_status()

        file_path = str(self.config.get_data_unit_path(tile_info))

        # Stream the response in 8 KiB chunks so the whole archive is
        # never held in memory at once.
        with self.data_store.open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['tile_id']}"
            )
            return file_path

    except requests.exceptions.RequestException as e:
        self.logger.error(
            f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
        )
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
download_data_units(tiles, **kwargs)

Download data files for multiple SRTM tiles.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def download_data_units(
    self,
    tiles: Union[pd.DataFrame, List[dict]],
    **kwargs,
) -> List[str]:
    """Download data files for multiple SRTM tiles.

    Args:
        tiles: DataFrame of tile records or list of tile dicts; each
            record must be accepted by ``download_data_unit``.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        Paths of successfully downloaded files (failed tiles are omitted).
    """

    if len(tiles) == 0:
        self.logger.warning(f"There is no matching data")
        return []

    # Download tiles in parallel across config.n_workers processes.
    with multiprocessing.Pool(self.config.n_workers) as pool:
        download_func = functools.partial(self.download_data_unit)
        file_paths = list(
            tqdm(
                pool.imap(
                    download_func,
                    (
                        [row for _, row in tiles.iterrows()]
                        if isinstance(tiles, pd.DataFrame)
                        else tiles
                    ),
                ),
                total=len(tiles),
                desc=f"Downloading SRTM elevation data",
            )
        )

    # Drop None entries, i.e. tiles whose download failed.
    return [path for path in file_paths if path is not None]
NasaSRTMHandler

Bases: BaseHandler

Main handler class for NASA SRTM elevation data.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
class NasaSRTMHandler(BaseHandler):
    """Main handler class for NASA SRTM elevation data.

    Provides the factory methods BaseHandler uses to assemble the SRTM
    config, downloader, and reader components.
    """

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> NasaSRTMConfig:
        """Create and return a NasaSRTMConfig instance."""
        common = dict(data_store=data_store, logger=logger)
        return NasaSRTMConfig(**common, **kwargs)

    def create_downloader(
        self,
        config: NasaSRTMConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> NasaSRTMDownloader:
        """Create and return a NasaSRTMDownloader instance."""
        common = dict(config=config, data_store=data_store, logger=logger)
        return NasaSRTMDownloader(**common, **kwargs)

    def create_reader(
        self,
        config: NasaSRTMConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> NasaSRTMReader:
        """Create and return a NasaSRTMReader instance."""
        common = dict(config=config, data_store=data_store, logger=logger)
        return NasaSRTMReader(**common, **kwargs)
create_config(data_store, logger, **kwargs)

Create and return a NasaSRTMConfig instance.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> NasaSRTMConfig:
    """Create and return a NasaSRTMConfig instance.

    Args:
        data_store: Storage backend passed through to the config.
        logger: Logger passed through to the config.
        **kwargs: Extra options forwarded to ``NasaSRTMConfig``.
    """
    return NasaSRTMConfig(data_store=data_store, logger=logger, **kwargs)
create_downloader(config, data_store, logger, **kwargs)

Create and return a NasaSRTMDownloader instance.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def create_downloader(
    self,
    config: NasaSRTMConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> NasaSRTMDownloader:
    """Create and return a NasaSRTMDownloader instance.

    Args:
        config: SRTM configuration passed through to the downloader.
        data_store: Storage backend passed through to the downloader.
        logger: Logger passed through to the downloader.
        **kwargs: Extra options forwarded to ``NasaSRTMDownloader``.
    """
    return NasaSRTMDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a NasaSRTMReader instance.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def create_reader(
    self,
    config: NasaSRTMConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> NasaSRTMReader:
    """Create and return a NasaSRTMReader instance.

    Args:
        config: SRTM configuration passed through to the reader.
        data_store: Storage backend passed through to the reader.
        logger: Logger passed through to the reader.
        **kwargs: Extra options forwarded to ``NasaSRTMReader``.
    """
    return NasaSRTMReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
NasaSRTMReader

Bases: BaseHandlerReader

A class to handle reading of NASA SRTM elevation data.

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
class NasaSRTMReader(BaseHandlerReader):
    """A class to handle reading of NASA SRTM elevation data."""

    def __init__(
        self,
        config: Optional[NasaSRTMConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: Optional configuration for customizing reading behavior and file paths.
                    If None, a default `NasaSRTMConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data storage.
                        If provided, it overrides the `data_store` in the `config`.
                        If None, the `data_store` from the `config` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        super().__init__(
            config=config or NasaSRTMConfig(),
            data_store=data_store,
            logger=logger,
        )

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Union[pd.DataFrame, List[SRTMParser]]:
        """
        Load SRTM elevation data from file paths.

        Args:
            source_data_path: List of SRTM .hgt.zip file paths
            **kwargs: Additional parameters for data loading
                - as_dataframe: bool, default=True. If True, return concatenated DataFrame.
                               If False, return list of SRTMParser objects.
                - dropna: bool, default=True. If True, drop rows with NaN elevation values.

        Returns:
            Union[pd.DataFrame, List[SRTMParser]]: Loaded elevation data
        """
        as_dataframe = kwargs.get("as_dataframe", True)
        dropna = kwargs.get("dropna", True)

        # Parse each tile independently: failures are logged and skipped so
        # one unreadable file does not abort the whole batch.
        loaded = []
        for path in source_data_path:
            try:
                loaded.append(SRTMParser(path, data_store=self.data_store))
            except Exception as exc:
                self.logger.error(f"Failed to load SRTM tile {path}: {str(exc)}")
            else:
                self.logger.debug(f"Successfully loaded SRTM tile: {path}")

        if not loaded:
            self.logger.warning("No SRTM tiles could be loaded")
            return pd.DataFrame() if as_dataframe else []

        if not as_dataframe:
            self.logger.info(f"Loaded {len(loaded)} SRTM tiles")
            return loaded

        # Concatenate the per-tile dataframes into a single result.
        frames = [tile.to_dataframe(dropna=dropna) for tile in loaded]
        if not frames:
            return pd.DataFrame()
        combined = pd.concat(frames, ignore_index=True)
        self.logger.info(
            f"Loaded {len(combined)} elevation points from {len(loaded)} tiles"
        )
        return combined
__init__(config=None, data_store=None, logger=None)

Initialize the reader.

Parameters:

Name Type Description Default
config Optional[NasaSRTMConfig]

Optional configuration for customizing reading behavior and file paths. If None, a default NasaSRTMConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If provided, it overrides the data_store in the config. If None, the data_store from the config is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def __init__(
    self,
    config: Optional[NasaSRTMConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the reader.

    Args:
        config: Optional configuration for customizing reading behavior and file paths.
                If None, a default `NasaSRTMConfig` is used.
        data_store: Optional instance of a `DataStore` for managing data storage.
                    If provided, it overrides the `data_store` in the `config`.
                    If None, the `data_store` from the `config` is used.
        logger: Optional custom logger instance. If None, a default logger
                named after the module is created and used.
    """
    # Fall back to a default config so the reader is usable out of the box.
    config = config or NasaSRTMConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
load_from_paths(source_data_path, **kwargs)

Load SRTM elevation data from file paths.

Parameters:

Name Type Description Default
source_data_path List[Union[str, Path]]

List of SRTM .hgt.zip file paths

required
**kwargs

Additional parameters for data loading - as_dataframe: bool, default=True. If True, return concatenated DataFrame. If False, return list of SRTMParser objects. - dropna: bool, default=True. If True, drop rows with NaN elevation values.

{}

Returns:

Type Description
Union[DataFrame, List[SRTMParser]]

Union[pd.DataFrame, List[SRTMParser]]: Loaded elevation data

Source code in gigaspatial/handlers/srtm/nasa_srtm.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Union[pd.DataFrame, List[SRTMParser]]:
    """
    Load SRTM elevation data from file paths.

    Args:
        source_data_path: List of SRTM .hgt.zip file paths
        **kwargs: Additional parameters for data loading
            - as_dataframe: bool, default=True. If True, return concatenated DataFrame.
                           If False, return list of SRTMParser objects.
            - dropna: bool, default=True. If True, drop rows with NaN elevation values.

    Returns:
        Union[pd.DataFrame, List[SRTMParser]]: Loaded elevation data
    """
    as_dataframe = kwargs.get("as_dataframe", True)
    dropna = kwargs.get("dropna", True)

    # Parse each tile independently so one unreadable file does not abort
    # the whole batch; failures are logged and skipped.
    parsers = []
    for file_path in source_data_path:
        try:
            parser = SRTMParser(file_path, data_store=self.data_store)
            parsers.append(parser)
            self.logger.debug(f"Successfully loaded SRTM tile: {file_path}")
        except Exception as e:
            self.logger.error(f"Failed to load SRTM tile {file_path}: {str(e)}")
            continue

    if not parsers:
        self.logger.warning("No SRTM tiles could be loaded")
        return pd.DataFrame() if as_dataframe else []

    if as_dataframe:
        # Concatenate all tile dataframes
        dataframes = [parser.to_dataframe(dropna=dropna) for parser in parsers]
        if dataframes:
            combined_df = pd.concat(dataframes, ignore_index=True)
            self.logger.info(
                f"Loaded {len(combined_df)} elevation points from {len(parsers)} tiles"
            )
            return combined_df
        else:
            return pd.DataFrame()
    else:
        self.logger.info(f"Loaded {len(parsers)} SRTM tiles")
        return parsers

srtm_manager

SRTMManager

Manager for accessing elevation data across multiple SRTM .hgt.zip files.

Implements lazy loading with LRU caching for efficient memory usage. Automatically handles multiple tiles for elevation profiles.

Source code in gigaspatial/handlers/srtm/srtm_manager.py
class SRTMManager:
    """
    Manager for accessing elevation data across multiple SRTM .hgt.zip files.

    Implements lazy loading with LRU caching for efficient memory usage.
    Automatically handles multiple tiles for elevation profiles.
    """

    def __init__(
        self,
        srtm_directory: Union[str, Path],
        downloader: Optional[NasaSRTMDownloader] = None,
        cache_size: int = 10,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the SRTM Manager.

        Parameters
        ----------
        srtm_directory : str or Path
            Directory containing .hgt.zip files
        downloader : optional
            Downloader instance for auto-downloading missing tiles
        cache_size : int, default=10
            Maximum number of SRTM tiles to keep in memory (LRU cache)
        data_store : DataStore, optional
            Data store for reading files. Priority: provided data_store >
            downloader.data_store > LocalDataStore()
        """
        self.srtm_directory = Path(srtm_directory)
        self.downloader = downloader

        # Set data_store: use provided, otherwise downloader's, otherwise LocalDataStore
        if data_store is not None:
            self.data_store = data_store
        elif downloader is not None and hasattr(downloader, "data_store"):
            self.data_store = downloader.data_store
        else:
            self.data_store = LocalDataStore()

        # Check if directory exists
        if not self.data_store.is_dir(str(self.srtm_directory)):
            raise FileNotFoundError(f"Directory not found: {self.srtm_directory}")

        # Build index of available tiles
        self.tile_index = self._build_tile_index()

        # Set up per-instance LRU cache for lazy loading; maxsize bounds how
        # many parsed tiles are held in memory at once.
        self._get_parser_cached = lru_cache(maxsize=cache_size)(self._load_parser)

    def _build_tile_index(self) -> dict:
        """
        Build an index of available SRTM tiles in the directory.

        Returns
        -------
        dict
            Mapping of (lat, lon) tuple to file path
        """
        tile_index = {}

        # Pattern to match SRTM filenames: N00E000 or S00W000
        pattern = re.compile(r"^([NS])(\d{2})([EW])(\d{3})")

        # List files using DataStore
        file_list = self.data_store.list_files(str(self.srtm_directory))

        for file_path_str in file_list:
            if file_path_str.endswith(".hgt.zip"):
                # Extract just the filename for pattern matching
                file_name = Path(file_path_str).name
                file_stem = Path(file_name).stem

                match = pattern.match(file_stem)

                if match:
                    lat_dir, lat_val, lon_dir, lon_val = match.groups()

                    # N/E are positive hemispheres; S/W negate the value.
                    lat = int(lat_val) if lat_dir == "N" else -int(lat_val)
                    lon = int(lon_val) if lon_dir == "E" else -int(lon_val)

                    # Use the path as returned by DataStore (will be used for reading)
                    tile_index[(lat, lon)] = file_path_str

        return tile_index

    def _get_tile_coordinates(
        self, latitude: float, longitude: float
    ) -> Tuple[int, int]:
        """
        Get the tile coordinates (southwest corner) for a given lat/lon.

        Parameters
        ----------
        latitude : float
            Latitude in decimal degrees
        longitude : float
            Longitude in decimal degrees

        Returns
        -------
        tuple of (lat_tile, lon_tile)
            Southwest corner coordinates of the tile
        """
        # SRTM tiles are 1x1 degree, named by their southwest corner
        lat_tile = int(np.floor(latitude))
        lon_tile = int(np.floor(longitude))

        return lat_tile, lon_tile

    def _load_parser(self, lat_tile: int, lon_tile: int):
        """
        Load a SRTMParser for a specific tile (used with LRU cache).

        Parameters
        ----------
        lat_tile : int
            Tile latitude (southwest corner)
        lon_tile : int
            Tile longitude (southwest corner)

        Returns
        -------
        SRTMParser
            Parser instance for the tile

        Raises
        ------
        FileNotFoundError
            If the tile is missing and cannot be (auto-)downloaded
        """
        tile_key = (lat_tile, lon_tile)

        if tile_key not in self.tile_index:
            if self.downloader:
                # Auto-download missing tile
                from shapely.geometry import box

                # Create tile_info following the pattern from NasaSRTMConfig
                tile_id = self.downloader.config._tile_name(lat_tile, lon_tile)
                tile_url = f"{self.downloader.config.BASE_URL}/{tile_id}.SRTMGL{self.downloader.config._res_arc}.hgt.zip"

                tile_info = {
                    "tile_id": tile_id,
                    "geometry": box(lon_tile, lat_tile, lon_tile + 1, lat_tile + 1),
                    "tile_url": tile_url,
                }

                # Use download_data_unit for direct download
                self.downloader.download_data_unit(tile_info)

                # Rebuild index to find new tile
                self.tile_index = self._build_tile_index()

                # Check if tile is now available
                if tile_key not in self.tile_index:
                    raise FileNotFoundError(
                        f"SRTM tile for ({lat_tile}, {lon_tile}) could not be downloaded to {self.srtm_directory}"
                    )
            else:
                raise FileNotFoundError(
                    f"SRTM tile for ({lat_tile}, {lon_tile}) not found in {self.srtm_directory}"
                )

        return SRTMParser(self.tile_index[tile_key], data_store=self.data_store)

    def get_elevation(self, latitude: float, longitude: float) -> float:
        """
        Get interpolated elevation for a specific coordinate.

        Automatically finds and loads the correct SRTM tile.

        Parameters
        ----------
        latitude : float
            Latitude in decimal degrees (-90 to 90)
        longitude : float
            Longitude in decimal degrees (-180 to 180)

        Returns
        -------
        float
            Interpolated elevation in meters

        Raises
        ------
        FileNotFoundError
            If the required SRTM tile is not available
        """
        # Get tile coordinates
        lat_tile, lon_tile = self._get_tile_coordinates(latitude, longitude)

        # Load parser (cached)
        parser = self._get_parser_cached(lat_tile, lon_tile)

        # Get elevation
        return parser.get_elevation(latitude, longitude)

    def get_elevation_batch(self, coordinates: np.ndarray) -> np.ndarray:
        """
        Get elevations for multiple coordinates efficiently.

        Groups coordinates by tile to minimize parser loads.

        Parameters
        ----------
        coordinates : np.ndarray of shape (n, 2)
            Array of (latitude, longitude) pairs

        Returns
        -------
        np.ndarray of shape (n,)
            Elevations in meters

        Raises
        ------
        FileNotFoundError
            If any required SRTM tile is not available
        """
        elevations = np.zeros(len(coordinates))

        # Group coordinates by tile so each tile is loaded at most once
        tile_groups = {}
        for i, (lat, lon) in enumerate(coordinates):
            tile_key = self._get_tile_coordinates(lat, lon)
            tile_groups.setdefault(tile_key, []).append((i, lat, lon))

        # Process each tile group
        for tile_key, coords_list in tile_groups.items():
            parser = self._get_parser_cached(*tile_key)

            # Extract coordinates for this tile
            indices = [c[0] for c in coords_list]
            tile_coords = np.array([[c[1], c[2]] for c in coords_list])

            # Get elevations
            tile_elevations = parser.get_elevation_batch(tile_coords)

            # Store results back in original input order
            elevations[indices] = tile_elevations

        return elevations

    def get_elevation_profile(
        self,
        start_lat: float,
        start_lon: float,
        end_lat: float,
        end_lon: float,
        num_points: int = 100,
    ) -> pd.DataFrame:
        """
        Get elevation profile between two points.

        Uses linear interpolation between points and automatically handles multiple SRTM tiles.
        For more accurate great circle paths over long distances, consider using geopy.

        Parameters
        ----------
        start_lat : float
            Starting latitude in decimal degrees
        start_lon : float
            Starting longitude in decimal degrees
        end_lat : float
            Ending latitude in decimal degrees
        end_lon : float
            Ending longitude in decimal degrees
        num_points : int, default=100
            Number of sample points along the path

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: distance_km, latitude, longitude, elevation

        Raises
        ------
        FileNotFoundError
            If any required SRTM tile along the path is not available
        """
        # Generate points along the path (linear interpolation)
        lats = np.linspace(start_lat, end_lat, num_points)
        lons = np.linspace(start_lon, end_lon, num_points)

        coordinates = np.column_stack((lats, lons))

        # Get elevations for all points
        elevations = self.get_elevation_batch(coordinates)

        # Calculate distances using Haversine formula
        distances = self._calculate_cumulative_distances(lats, lons)

        # Create DataFrame
        profile = pd.DataFrame(
            {
                "distance_km": distances,
                "latitude": lats,
                "longitude": lons,
                "elevation": elevations,
            }
        )

        return profile

    @staticmethod
    def _calculate_cumulative_distances(
        lats: np.ndarray, lons: np.ndarray
    ) -> np.ndarray:
        """
        Calculate cumulative distances along a path using Haversine formula.

        Fully vectorized: segment lengths are computed with np.diff and
        accumulated with np.cumsum instead of a Python loop.

        Parameters
        ----------
        lats : np.ndarray
            Array of latitudes
        lons : np.ndarray
            Array of longitudes

        Returns
        -------
        np.ndarray
            Cumulative distances in kilometers (first entry is 0)
        """
        R = 6371.0  # Earth radius in km

        lat_r = np.radians(np.asarray(lats, dtype=float))
        lon_r = np.radians(np.asarray(lons, dtype=float))

        # Per-segment deltas between consecutive points
        dlat = np.diff(lat_r)
        dlon = np.diff(lon_r)

        # Haversine formula, applied to all segments at once
        a = (
            np.sin(dlat / 2) ** 2
            + np.cos(lat_r[:-1]) * np.cos(lat_r[1:]) * np.sin(dlon / 2) ** 2
        )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        distances = np.zeros(len(lat_r))
        distances[1:] = np.cumsum(R * c)

        return distances

    def get_available_tiles(self) -> List[Tuple[int, int]]:
        """
        Get list of available SRTM tiles.

        Returns
        -------
        list of tuples
            List of (lat, lon) tile coordinates
        """
        return list(self.tile_index.keys())

    def check_coverage(self, latitude: float, longitude: float) -> bool:
        """
        Check if a specific coordinate has SRTM coverage.

        Parameters
        ----------
        latitude : float
            Latitude in decimal degrees
        longitude : float
            Longitude in decimal degrees

        Returns
        -------
        bool
            True if tile is available, False otherwise
        """
        tile_key = self._get_tile_coordinates(latitude, longitude)
        return tile_key in self.tile_index

    def clear_cache(self):
        """Clear the LRU cache of loaded parsers."""
        self._get_parser_cached.cache_clear()

    def get_cache_info(self):
        """
        Get cache statistics.

        Returns
        -------
        CacheInfo
            Named tuple with hits, misses, maxsize, currsize
        """
        return self._get_parser_cached.cache_info()

    def __repr__(self):
        return (
            f"SRTMManager(directory={self.srtm_directory}, "
            f"tiles={len(self.tile_index)}, "
            f"cache_size={self._get_parser_cached.cache_info().maxsize}, "
            f"data_store={type(self.data_store).__name__})"
        )
__init__(srtm_directory, downloader=None, cache_size=10, data_store=None)

Initialize the SRTM Manager.

Parameters

- `srtm_directory` (str or Path): Directory containing .hgt.zip files
- `downloader` (optional): Downloader instance for auto-downloading missing tiles
- `cache_size` (int, default=10): Maximum number of SRTM tiles to keep in memory (LRU cache)
- `data_store` (DataStore, optional): Data store for reading files. Priority: provided data_store > downloader.data_store > LocalDataStore()

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def __init__(
    self,
    srtm_directory: Union[str, Path],
    downloader: NasaSRTMDownloader = None,
    cache_size: int = 10,
    data_store: Optional[DataStore] = None,
):
    """
    Initialize the SRTM Manager.

    Parameters
    ----------
    srtm_directory : str or Path
        Directory containing .hgt.zip files
    downloader : optional
        Downloader instance for auto-downloading missing tiles
    cache_size : int, default=10
        Maximum number of SRTM tiles to keep in memory (LRU cache)
    data_store : DataStore, optional
        Data store for reading files. Priority: provided data_store >
        downloader.data_store > LocalDataStore()

    Raises
    ------
    FileNotFoundError
        If ``srtm_directory`` does not exist in the resolved data store.
    """
    self.srtm_directory = Path(srtm_directory)
    self.downloader = downloader

    # Set data_store: use provided, otherwise downloader's, otherwise LocalDataStore
    if data_store is not None:
        self.data_store = data_store
    elif downloader is not None and hasattr(downloader, "data_store"):
        self.data_store = downloader.data_store
    else:
        self.data_store = LocalDataStore()

    # Check if directory exists
    if not self.data_store.is_dir(str(self.srtm_directory)):
        raise FileNotFoundError(f"Directory not found: {self.srtm_directory}")

    # Build index of available tiles
    self.tile_index = self._build_tile_index()

    # Set up per-instance LRU cache for lazy loading; maxsize bounds how
    # many parsed tiles are held in memory at once.
    self._get_parser_cached = lru_cache(maxsize=cache_size)(self._load_parser)
check_coverage(latitude, longitude)

Check if a specific coordinate has SRTM coverage.

Parameters

latitude : float Latitude in decimal degrees longitude : float Longitude in decimal degrees

Returns

bool True if tile is available, False otherwise

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def check_coverage(self, latitude: float, longitude: float) -> bool:
    """
    Check if a specific coordinate has SRTM coverage.

    Parameters
    ----------
    latitude : float
        Latitude in decimal degrees
    longitude : float
        Longitude in decimal degrees

    Returns
    -------
    bool
        True if tile is available, False otherwise
    """
    # A coordinate is covered when its containing tile is in the index.
    return self._get_tile_coordinates(latitude, longitude) in self.tile_index
clear_cache()

Clear the LRU cache of loaded parsers.

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def clear_cache(self):
    """Evict every loaded SRTM tile parser from the LRU cache."""
    # cache_clear() is supplied by the functools.lru_cache wrapper
    # installed around _load_parser in __init__.
    self._get_parser_cached.cache_clear()
get_available_tiles()

Get list of available SRTM tiles.

Returns

list of tuples List of (lat, lon) tile coordinates

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def get_available_tiles(self) -> List[Tuple[int, int]]:
    """
    List the corner coordinates of every indexed SRTM tile.

    Returns
    -------
    list of tuples
        (lat, lon) tile coordinates, one entry per available tile
    """
    # The index maps (lat, lon) keys to tile files; the keys are the tiles.
    return list(self.tile_index)
get_cache_info()

Get cache statistics.

Returns

CacheInfo Named tuple with hits, misses, maxsize, currsize

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def get_cache_info(self):
    """
    Return statistics for the tile-parser LRU cache.

    Returns
    -------
    CacheInfo
        Named tuple with hits, misses, maxsize, currsize fields
    """
    # Delegate straight to the lru_cache wrapper's introspection hook.
    return self._get_parser_cached.cache_info()
get_elevation(latitude, longitude)

Get interpolated elevation for a specific coordinate.

Automatically finds and loads the correct SRTM tile.

Parameters

latitude : float — Latitude in decimal degrees (-90 to 90); longitude : float — Longitude in decimal degrees (-180 to 180)

Returns

float Interpolated elevation in meters

Raises

FileNotFoundError If the required SRTM tile is not available

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def get_elevation(self, latitude: float, longitude: float) -> float:
    """
    Look up the interpolated elevation at a single coordinate.

    The covering SRTM tile is resolved and its parser loaded through the
    LRU cache automatically.

    Parameters
    ----------
    latitude : float
        Latitude in decimal degrees (-90 to 90)
    longitude : float
        Longitude in decimal degrees (-180 to 180)

    Returns
    -------
    float
        Interpolated elevation in meters

    Raises
    ------
    FileNotFoundError
        If the required SRTM tile is not available
    """
    # Resolve the covering tile, fetch its parser (cached), and delegate.
    tile = self._get_tile_coordinates(latitude, longitude)
    parser = self._get_parser_cached(*tile)
    return parser.get_elevation(latitude, longitude)
get_elevation_batch(coordinates)

Get elevations for multiple coordinates efficiently.

Groups coordinates by tile to minimize parser loads.

Parameters

coordinates : np.ndarray of shape (n, 2) Array of (latitude, longitude) pairs

Returns

np.ndarray of shape (n,) Elevations in meters

Raises

FileNotFoundError If any required SRTM tile is not available

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def get_elevation_batch(self, coordinates: np.ndarray) -> np.ndarray:
    """
    Look up interpolated elevations for many coordinates at once.

    Coordinates are bucketed per SRTM tile so each tile parser is fetched
    from the cache only once per call.

    Parameters
    ----------
    coordinates : np.ndarray of shape (n, 2)
        Array of (latitude, longitude) pairs

    Returns
    -------
    np.ndarray of shape (n,)
        Elevations in meters

    Raises
    ------
    FileNotFoundError
        If any required SRTM tile is not available
    """
    elevations = np.zeros(len(coordinates))

    # Bucket the row index and coordinates of every point by covering tile.
    tile_groups = {}
    for row, (lat, lon) in enumerate(coordinates):
        key = self._get_tile_coordinates(lat, lon)
        tile_groups.setdefault(key, []).append((row, lat, lon))

    # Resolve each tile once and fill in its rows of the result array.
    for key, rows in tile_groups.items():
        parser = self._get_parser_cached(*key)

        indices = [r[0] for r in rows]
        points = np.array([[r[1], r[2]] for r in rows])

        # Vectorized lookup within this tile, scattered back by row index.
        elevations[indices] = parser.get_elevation_batch(points)

    return elevations
get_elevation_profile(start_lat, start_lon, end_lat, end_lon, num_points=100)

Get elevation profile between two points.

Uses linear interpolation between points and automatically handles multiple SRTM tiles. For more accurate great circle paths over long distances, consider using geopy.

Parameters

start_lat : float — Starting latitude in decimal degrees; start_lon : float — Starting longitude in decimal degrees; end_lat : float — Ending latitude in decimal degrees; end_lon : float — Ending longitude in decimal degrees; num_points : int, default=100 — Number of sample points along the path

Returns

pd.DataFrame DataFrame with columns: distance_km, latitude, longitude, elevation

Raises

FileNotFoundError If any required SRTM tile along the path is not available

Source code in gigaspatial/handlers/srtm/srtm_manager.py
def get_elevation_profile(
    self,
    start_lat: float,
    start_lon: float,
    end_lat: float,
    end_lon: float,
    num_points: int = 100,
) -> pd.DataFrame:
    """
    Sample an elevation profile along the straight line between two points.

    Sample locations are linearly interpolated in lat/lon space, and all
    SRTM tiles along the way are handled automatically. For long paths
    where great-circle accuracy matters, consider geopy instead.

    Parameters
    ----------
    start_lat : float
        Starting latitude in decimal degrees
    start_lon : float
        Starting longitude in decimal degrees
    end_lat : float
        Ending latitude in decimal degrees
    end_lon : float
        Ending longitude in decimal degrees
    num_points : int, default=100
        Number of sample points along the path

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: distance_km, latitude, longitude, elevation

    Raises
    ------
    FileNotFoundError
        If any required SRTM tile along the path is not available
    """
    # Linearly spaced sample coordinates between the two endpoints.
    sample_lats = np.linspace(start_lat, end_lat, num_points)
    sample_lons = np.linspace(start_lon, end_lon, num_points)

    # Elevations for all samples (tile grouping handled internally).
    sample_elevs = self.get_elevation_batch(
        np.column_stack((sample_lats, sample_lons))
    )

    # Cumulative along-path distances via the Haversine helper.
    sample_dists = self._calculate_cumulative_distances(sample_lats, sample_lons)

    return pd.DataFrame(
        {
            "distance_km": sample_dists,
            "latitude": sample_lats,
            "longitude": sample_lons,
            "elevation": sample_elevs,
        }
    )

srtm_parser

SRTMParser

Efficient parser for NASA SRTM .hgt.zip files.

Supports both SRTM-1 (3601x3601, 1 arc-second) and SRTM-3 (1201x1201, 3 arc-second) formats. Uses memory mapping for efficient handling of large files.

Source code in gigaspatial/handlers/srtm/srtm_parser.py
class SRTMParser:
    """
    Efficient parser for NASA SRTM .hgt.zip files.

    Supports both SRTM-1 (3601x3601, 1 arc-second) and SRTM-3 (1201x1201, 3 arc-second) formats.
    The archive is decompressed in memory and decoded into a float32 NumPy grid
    (voids stored as NaN), with a RegularGridInterpolator built on top for fast
    bilinear elevation queries.
    """

    def __init__(
        self, hgt_zip_path: Union[str, Path], data_store: Optional[DataStore] = None
    ):
        """
        Initialize the SRTM parser.

        Parameters
        ----------
        hgt_zip_path : str or Path
            Path to the .hgt.zip file (e.g., 'S03E028.SRTMGL1.hgt.zip')
        data_store : DataStore, optional
            Data store for reading files. If None, uses LocalDataStore()

        Raises
        ------
        FileNotFoundError
            If the archive does not exist in the data store.
        ValueError
            If the archive contains no .hgt file, or its size matches neither
            the SRTM-1 nor the SRTM-3 format.
        """
        self.hgt_zip_path = Path(hgt_zip_path)
        self.data_store = data_store or LocalDataStore()

        # Check if file exists (fail fast before any decoding work)
        if not self.data_store.file_exists(str(self.hgt_zip_path)):
            raise FileNotFoundError(f"File not found: {self.hgt_zip_path}")

        # Extract tile coordinates from filename (e.g. S03E028)
        self._parse_filename()

        # Load the elevation data (sets self.data, self.resolution, self.size)
        self.data = None
        self.resolution = None
        self.size = None
        self._load_data()

        # Set up interpolator for efficient querying
        self._setup_interpolator()

    def _parse_filename(self):
        """Extract latitude and longitude from the .hgt filename.

        SRTM tiles are named after the corner stored in ``lat_corner`` /
        ``lon_corner``; per ``_setup_interpolator`` this is the tile's
        south-west corner (lats run from ``lat_corner + 1`` down to
        ``lat_corner``, lons from ``lon_corner`` up to ``lon_corner + 1``).
        """
        filename = self.hgt_zip_path.stem.split(".")[
            0
        ]  # Get base name without extensions

        # Parse latitude (first 3 characters: N/S + 2 digits)
        lat_str = filename[:3]
        lat_dir = lat_str[0]
        lat_val = int(lat_str[1:])
        # Southern-hemisphere tiles get a negative corner latitude
        self.lat_corner = lat_val if lat_dir == "N" else -lat_val

        # Parse longitude (next 4 characters: E/W + 3 digits)
        lon_str = filename[3:7]
        lon_dir = lon_str[0]
        lon_val = int(lon_str[1:])
        # Western-hemisphere tiles get a negative corner longitude
        self.lon_corner = lon_val if lon_dir == "E" else -lon_val

    def _load_data(self):
        """Load elevation data from .hgt.zip file using memory-efficient approach.

        The archive is read fully into memory (no temp files) and the grid
        size is inferred from the exact .hgt payload size.
        """
        # Read the zip file from DataStore
        zip_data = self.data_store.read_file(str(self.hgt_zip_path))

        # Create a BytesIO object from the zip data
        zip_file_obj = io.BytesIO(zip_data)

        # Extract .hgt file from zip
        with zipfile.ZipFile(zip_file_obj, "r") as zip_ref:
            # Find the .hgt file inside the zip
            hgt_files = [f for f in zip_ref.namelist() if f.endswith(".hgt")]

            if not hgt_files:
                raise ValueError(f"No .hgt file found in {self.hgt_zip_path}")

            # If several .hgt entries exist, only the first is used
            hgt_filename = hgt_files[0]

            # Read the binary data
            with zip_ref.open(hgt_filename) as hgt_file:
                hgt_data = hgt_file.read()

        # Determine resolution based on file size
        file_size = len(hgt_data)

        if file_size == 25934402:  # 3601 * 3601 * 2 bytes (SRTM-1, 1 arc-second)
            self.size = 3601
            self.resolution = 1 / 3600  # degrees
        elif file_size == 2884802:  # 1201 * 1201 * 2 bytes (SRTM-3, 3 arc-second)
            self.size = 1201
            self.resolution = 3 / 3600  # degrees
        else:
            raise ValueError(f"Unexpected file size: {file_size} bytes")

        # Parse binary data as big-endian 16-bit signed integers (">i2")
        # Using numpy for efficiency
        self.data = np.frombuffer(hgt_data, dtype=">i2").reshape((self.size, self.size))

        # Replace void values (-32768) with NaN; requires a float dtype,
        # so the grid is widened to float32 first
        self.data = self.data.astype(np.float32)
        self.data[self.data == -32768] = np.nan

    def _setup_interpolator(self):
        """Set up RegularGridInterpolator for efficient elevation queries.

        The latitude axis is built descending (north to south) to match the
        row order of the raw SRTM grid; points outside the tile yield NaN
        because ``bounds_error=False`` with ``fill_value=np.nan``.
        """
        # Create coordinate arrays
        # Note: SRTM data is stored from north to south (top to bottom)
        lats = np.linspace(
            self.lat_corner + 1, self.lat_corner, self.size  # North edge  # South edge
        )
        lons = np.linspace(
            self.lon_corner, self.lon_corner + 1, self.size  # West edge  # East edge
        )

        self.interpolator = RegularGridInterpolator(
            (lats, lons),
            self.data,
            method="linear",
            bounds_error=False,
            fill_value=np.nan,
        )

        # Store coordinate arrays for reference
        self.lats = lats
        self.lons = lons

    def to_dataframe(self, dropna=True) -> pd.DataFrame:
        """
        Convert elevation data to a DataFrame with coordinates.

        Parameters
        ----------
        dropna : bool, default=True
            When True, rows whose elevation is NaN (voids) are dropped.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: latitude, longitude, elevation
        """
        # Create meshgrid of coordinates
        lon_grid, lat_grid = np.meshgrid(self.lons, self.lats)

        # Flatten arrays
        df = pd.DataFrame(
            {
                "latitude": lat_grid.ravel(),
                "longitude": lon_grid.ravel(),
                "elevation": self.data.ravel(),
            }
        )

        return df.dropna(subset=["elevation"]) if dropna else df

    def to_array(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Return elevation data in square array form with coordinate arrays.

        Copies are returned so callers cannot mutate the parser's internals.

        Returns
        -------
        tuple of (elevation_array, latitudes, longitudes)
            elevation_array : np.ndarray of shape (size, size)
                2D array of elevation values in meters
            latitudes : np.ndarray of shape (size,)
                Latitude values for each row (north to south)
            longitudes : np.ndarray of shape (size,)
                Longitude values for each column (west to east)
        """
        return self.data.copy(), self.lats.copy(), self.lons.copy()

    def get_elevation(self, latitude: float, longitude: float) -> float:
        """
        Get interpolated elevation for a specific coordinate.

        Uses bilinear interpolation for accurate elevation values between grid points.

        Parameters
        ----------
        latitude : float
            Latitude in decimal degrees
        longitude : float
            Longitude in decimal degrees

        Returns
        -------
        float
            Interpolated elevation in meters, or np.nan if outside tile bounds
        """
        # Check if coordinates are within the 1-degree tile footprint
        if not (self.lat_corner <= latitude <= self.lat_corner + 1):
            return np.nan
        if not (self.lon_corner <= longitude <= self.lon_corner + 1):
            return np.nan

        # Use interpolator for bilinear interpolation
        elevation = self.interpolator([[latitude, longitude]])[0]

        return float(elevation)

    def get_elevation_batch(self, coordinates: np.ndarray) -> np.ndarray:
        """
        Get interpolated elevations for multiple coordinates (vectorized).

        Unlike get_elevation, no explicit bounds check is done here: the
        interpolator itself returns NaN for out-of-tile points.

        Parameters
        ----------
        coordinates : np.ndarray of shape (n, 2)
            Array of (latitude, longitude) pairs

        Returns
        -------
        np.ndarray of shape (n,)
            Interpolated elevations in meters
        """
        return self.interpolator(coordinates)

    def get_tile_info(self) -> dict:
        """
        Get information about the SRTM tile.

        Returns
        -------
        dict
            Dictionary containing tile metadata: filename, corner
            coordinates, coverage ranges, resolution, grid size,
            elevation statistics (NaN-aware), and void percentage.
        """
        return {
            "filename": self.hgt_zip_path.name,
            "lat_corner": self.lat_corner,
            "lon_corner": self.lon_corner,
            "lat_range": (self.lat_corner, self.lat_corner + 1),
            "lon_range": (self.lon_corner, self.lon_corner + 1),
            "resolution_arcsec": 1 if self.size == 3601 else 3,
            "resolution_deg": self.resolution,
            "size": (self.size, self.size),
            "min_elevation": float(np.nanmin(self.data)),
            "max_elevation": float(np.nanmax(self.data)),
            "mean_elevation": float(np.nanmean(self.data)),
            "void_percentage": float(np.isnan(self.data).sum() / self.data.size * 100),
        }

    def __repr__(self):
        # e.g. SRTMParser(tile=-03+028, resolution=1arcsec, size=3601x3601, ...)
        return (
            f"SRTMParser(tile={self.lat_corner:+03d}{self.lon_corner:+04d}, "
            f"resolution={self.resolution*3600:.0f}arcsec, "
            f"size={self.size}x{self.size}, "
            f"data_store={type(self.data_store).__name__})"
        )
__init__(hgt_zip_path, data_store=None)

Initialize the SRTM parser.

Parameters

hgt_zip_path : str or Path Path to the .hgt.zip file (e.g., 'S03E028.SRTMGL1.hgt.zip') data_store : DataStore, optional Data store for reading files. If None, uses LocalDataStore()

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def __init__(
    self, hgt_zip_path: Union[str, Path], data_store: Optional[DataStore] = None
):
    """
    Initialize the SRTM parser.

    Parameters
    ----------
    hgt_zip_path : str or Path
        Path to the .hgt.zip file (e.g., 'S03E028.SRTMGL1.hgt.zip')
    data_store : DataStore, optional
        Data store for reading files. If None, uses LocalDataStore()
    """
    self.hgt_zip_path = Path(hgt_zip_path)
    self.data_store = data_store or LocalDataStore()

    # Fail fast when the archive is missing from the data store.
    if not self.data_store.file_exists(str(self.hgt_zip_path)):
        raise FileNotFoundError(f"File not found: {self.hgt_zip_path}")

    # Derive the tile's corner coordinates from the filename (e.g. S03E028).
    self._parse_filename()

    # Decode the raw elevation grid; these attributes are filled in
    # by _load_data.
    self.data = None
    self.resolution = None
    self.size = None
    self._load_data()

    # Build the bilinear interpolator used for elevation queries.
    self._setup_interpolator()
get_elevation(latitude, longitude)

Get interpolated elevation for a specific coordinate.

Uses bilinear interpolation for accurate elevation values between grid points.

Parameters

latitude : float Latitude in decimal degrees longitude : float Longitude in decimal degrees

Returns

float Interpolated elevation in meters, or np.nan if outside tile bounds

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def get_elevation(self, latitude: float, longitude: float) -> float:
    """
    Get the bilinearly interpolated elevation at one coordinate.

    Parameters
    ----------
    latitude : float
        Latitude in decimal degrees
    longitude : float
        Longitude in decimal degrees

    Returns
    -------
    float
        Elevation in meters, or np.nan if the point lies outside this tile
    """
    # Points outside the 1-degree tile footprint yield NaN immediately.
    inside_lat = self.lat_corner <= latitude <= self.lat_corner + 1
    inside_lon = self.lon_corner <= longitude <= self.lon_corner + 1
    if not (inside_lat and inside_lon):
        return np.nan

    # Delegate to the grid interpolator (bilinear) for a single point.
    return float(self.interpolator([[latitude, longitude]])[0])
get_elevation_batch(coordinates)

Get interpolated elevations for multiple coordinates (vectorized).

Parameters

coordinates : np.ndarray of shape (n, 2) Array of (latitude, longitude) pairs

Returns

np.ndarray of shape (n,) Interpolated elevations in meters

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def get_elevation_batch(self, coordinates: np.ndarray) -> np.ndarray:
    """
    Vectorized bilinear elevation lookup for many coordinates.

    Parameters
    ----------
    coordinates : np.ndarray of shape (n, 2)
        Array of (latitude, longitude) pairs

    Returns
    -------
    np.ndarray of shape (n,)
        Interpolated elevations in meters
    """
    # The grid interpolator accepts the whole (n, 2) array in one call.
    query = self.interpolator
    return query(coordinates)
get_tile_info()

Get information about the SRTM tile.

Returns

dict Dictionary containing tile metadata

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def get_tile_info(self) -> dict:
    """
    Summarize metadata for this SRTM tile.

    Returns
    -------
    dict
        Tile metadata: filename, corner coordinates, coverage ranges,
        resolution, grid size, elevation statistics, and void percentage
    """
    south, west = self.lat_corner, self.lon_corner
    info = {
        "filename": self.hgt_zip_path.name,
        "lat_corner": south,
        "lon_corner": west,
        "lat_range": (south, south + 1),
        "lon_range": (west, west + 1),
        "resolution_arcsec": 1 if self.size == 3601 else 3,
        "resolution_deg": self.resolution,
        "size": (self.size, self.size),
    }
    # Elevation statistics skip void cells (stored as NaN).
    info["min_elevation"] = float(np.nanmin(self.data))
    info["max_elevation"] = float(np.nanmax(self.data))
    info["mean_elevation"] = float(np.nanmean(self.data))
    info["void_percentage"] = float(np.isnan(self.data).sum() / self.data.size * 100)
    return info
to_array()

Return elevation data in square array form with coordinate arrays.

Returns

tuple of (elevation_array, latitudes, longitudes) — elevation_array : np.ndarray of shape (size, size), 2D array of elevation values in meters; latitudes : np.ndarray of shape (size,), latitude values for each row (north to south); longitudes : np.ndarray of shape (size,), longitude values for each column (west to east)

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def to_array(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Return copies of the elevation grid and its coordinate axes.

    Returns
    -------
    tuple of (elevation_array, latitudes, longitudes)
        elevation_array : np.ndarray of shape (size, size)
            2D elevation grid in meters
        latitudes : np.ndarray of shape (size,)
            Row latitudes, ordered north to south
        longitudes : np.ndarray of shape (size,)
            Column longitudes, ordered west to east
    """
    # Copies protect the parser's internal arrays from caller mutation.
    elevation = self.data.copy()
    row_lats = self.lats.copy()
    col_lons = self.lons.copy()
    return elevation, row_lats, col_lons
to_dataframe(dropna=True)

Convert elevation data to a DataFrame with coordinates.

Returns

pd.DataFrame DataFrame with columns: latitude, longitude, elevation

Source code in gigaspatial/handlers/srtm/srtm_parser.py
def to_dataframe(self, dropna=True) -> pd.DataFrame:
    """
    Flatten the elevation grid into a long-format DataFrame.

    Parameters
    ----------
    dropna : bool, default=True
        When True, rows whose elevation is NaN (voids) are dropped.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns: latitude, longitude, elevation
    """
    # Pair every grid cell with its coordinates, then flatten row-major.
    lon_grid, lat_grid = np.meshgrid(self.lons, self.lats)
    frame = pd.DataFrame(
        {
            "latitude": lat_grid.ravel(),
            "longitude": lon_grid.ravel(),
            "elevation": self.data.ravel(),
        }
    )
    if dropna:
        frame = frame.dropna(subset=["elevation"])
    return frame

utils

EarthdataSession

Bases: Session

Custom requests.Session for NASA Earthdata authentication.

Maintains Authorization headers through redirects to/from Earthdata hosts. This is required because Earthdata uses multiple redirect domains during authentication.

Source code in gigaspatial/handlers/srtm/utils.py
class EarthdataSession(requests.Session):
    """
    requests.Session subclass for NASA Earthdata authentication.

    Earthdata bounces requests through multiple redirect domains while
    authenticating, so the Authorization header must survive any redirect
    that starts from or lands on the Earthdata auth host, and be stripped
    on every other cross-host redirect.
    """

    AUTH_HOST = "urs.earthdata.nasa.gov"

    def __init__(self, username: str, password: str):
        super().__init__()
        # HTTP Basic credentials used for the Earthdata login exchange.
        self.auth = (username, password)

    def rebuild_auth(self, prepared_request, response):
        """Keep auth header on redirects to/from Earthdata host."""
        if "Authorization" not in prepared_request.headers:
            return

        source_host = requests.utils.urlparse(response.request.url).hostname
        target_host = requests.utils.urlparse(prepared_request.url).hostname

        # Strip credentials only when redirecting between two hosts that
        # are both unrelated to Earthdata.
        crossing_hosts = source_host != target_host
        involves_earthdata = self.AUTH_HOST in (source_host, target_host)
        if crossing_hosts and not involves_earthdata:
            del prepared_request.headers["Authorization"]
rebuild_auth(prepared_request, response)

Keep auth header on redirects to/from Earthdata host.

Source code in gigaspatial/handlers/srtm/utils.py
def rebuild_auth(self, prepared_request, response):
    """Keep auth header on redirects to/from Earthdata host."""
    headers = prepared_request.headers
    if "Authorization" not in headers:
        return

    origin = requests.utils.urlparse(response.request.url)
    target = requests.utils.urlparse(prepared_request.url)

    # Drop credentials only for cross-host redirects that neither start
    # from nor land on the Earthdata auth host.
    if (
        origin.hostname != target.hostname
        and origin.hostname != self.AUTH_HOST
        and target.hostname != self.AUTH_HOST
    ):
        del headers["Authorization"]

unicef_georepo

GeoRepoClient

A client for interacting with the GeoRepo API.

GeoRepo is a platform for managing and accessing geospatial administrative boundary data. This client provides methods to search, retrieve, and work with modules, datasets, views, and administrative entities.

Attributes:

Name Type Description
base_url str

The base URL for the GeoRepo API

api_key str

The API key for authentication

email str

The email address associated with the API key

headers dict

HTTP headers used for API requests

Source code in gigaspatial/handlers/unicef_georepo.py
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
class GeoRepoClient:
    """
    A client for interacting with the GeoRepo API.

    GeoRepo is a platform for managing and accessing geospatial administrative
    boundary data. This client provides methods to search, retrieve, and work
    with modules, datasets, views, and administrative entities.

    Attributes:
        base_url (str): The base URL for the GeoRepo API
        api_key (str): The API key for authentication
        email (str): The email address associated with the API key
        headers (dict): HTTP headers used for API requests
    """

    def __init__(self, api_key=None, email=None):
        """
        Initialize the GeoRepo client.

        Args:
            api_key (str, optional): GeoRepo API key. If not provided, will use
                the GEOREPO_API_KEY environment variable from config.
            email (str, optional): Email address associated with the API key.
                If not provided, will use the GEOREPO_USER_EMAIL environment
                variable from config.

        Raises:
            ValueError: If api_key or email is not provided and cannot be found
                in environment variables.
        """
        self.base_url = "https://georepo.unicef.org/api/v1"
        self.api_key = api_key or config.GEOREPO_API_KEY
        self.email = email or config.GEOREPO_USER_EMAIL
        self.logger = config.get_logger(self.__class__.__name__)

        if not self.api_key:
            raise ValueError(
                "API Key is required. Provide it as a parameter or set GEOREPO_API_KEY environment variable."
            )

        if not self.email:
            raise ValueError(
                "Email is required. Provide it as a parameter or set GEOREPO_USER_EMAIL environment variable."
            )

        self.headers = {
            "Accept": "application/json",
            "Authorization": f"Token {self.api_key}",
            "GeoRepo-User-Key": self.email,
        }

    def _make_request(self, method, endpoint, params=None, data=None, timeout=30):
        """
        Internal method to handle making HTTP requests.

        Args:
            method (str): HTTP method ("GET", "POST", etc.).
            endpoint (str): Fully-qualified URL to call.
            params (dict, optional): Query-string parameters.
            data (dict, optional): JSON-serializable request body.
            timeout (float, optional): Seconds to wait before giving up on the
                request. Defaults to 30. Without a timeout, requests can block
                indefinitely on an unresponsive server.

        Returns:
            requests.Response: The successful response object.

        Raises:
            requests.exceptions.HTTPError: If the request fails for any reason
                (network error, non-2xx status, timeout). The original exception
                is preserved as the cause.
        """
        try:
            response = requests.request(
                method,
                endpoint,
                headers=self.headers,
                params=params,
                json=data,
                timeout=timeout,
            )
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            # Chain the original exception so callers can inspect the root cause.
            raise requests.exceptions.HTTPError(f"API request failed: {e}") from e

    def check_connection(self):
        """
        Checks if the API connection is valid by making a simple request.

        Returns:
            bool: True if the connection is valid, False otherwise.
        """
        endpoint = f"{self.base_url}/search/module/list/"
        try:
            self._make_request("GET", endpoint)
            return True
        except requests.exceptions.HTTPError:
            # _make_request wraps every failure in HTTPError, so this covers
            # network errors and bad credentials alike.
            return False
        except requests.exceptions.RequestException as e:
            raise requests.exceptions.RequestException(
                f"Connection check encountered a network error: {e}"
            )

    def list_modules(self):
        """
        List all available modules in GeoRepo.

        A module is a top-level organizational unit that contains datasets.
        Examples include "Admin Boundaries", "Health Facilities", etc.

        Returns:
            dict: JSON response containing a list of modules with their metadata.
                Each module includes 'uuid', 'name', 'description', and other properties.

        Raises:
            requests.HTTPError: If the API request fails.
        """
        endpoint = f"{self.base_url}/search/module/list/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def list_datasets_by_module(self, module_uuid):
        """
        List all datasets within a specific module.

        A dataset represents a collection of related geographic entities,
        such as administrative boundaries for a specific country or region.

        Args:
            module_uuid (str): The UUID of the module to query.

        Returns:
            dict: JSON response containing a list of datasets with their metadata.
                Each dataset includes 'uuid', 'name', 'description', creation date, etc.

        Raises:
            requests.HTTPError: If the API request fails or module_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/module/{module_uuid}/dataset/list/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def get_dataset_details(self, dataset_uuid):
        """
        Get detailed information about a specific dataset.

        This includes metadata about the dataset and information about
        available administrative levels (e.g., country, province, district).

        Args:
            dataset_uuid (str): The UUID of the dataset to query.

        Returns:
            dict: JSON response containing dataset details including:
                - Basic metadata (name, description, etc.)
                - Available administrative levels and their properties
                - Temporal information and data sources

        Raises:
            requests.HTTPError: If the API request fails or dataset_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def list_views_by_dataset(self, dataset_uuid, page=1, page_size=50):
        """
        List views for a dataset with pagination support.

        A view represents a specific version or subset of a dataset.
        Views may be tagged as 'latest' or represent different time periods.

        Args:
            dataset_uuid (str): The UUID of the dataset to query.
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            dict: JSON response containing paginated list of views with metadata.
                Includes 'results', 'total_page', 'current_page', and 'count' fields.
                Each view includes 'uuid', 'name', 'tags', and other properties.

        Raises:
            requests.HTTPError: If the API request fails or dataset_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/view/list/"
        params = {"page": page, "page_size": page_size}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def list_entities_by_admin_level(
        self,
        view_uuid,
        admin_level,
        geom="no_geom",
        format="json",
        page=1,
        page_size=50,
    ):
        """
        List entities at a specific administrative level within a view.

        Administrative levels typically follow a hierarchy:
        - Level 0: Countries
        - Level 1: States/Provinces/Regions
        - Level 2: Districts/Counties
        - Level 3: Sub-districts/Municipalities
        - And so on...

        Args:
            view_uuid (str): The UUID of the view to query.
            admin_level (int): The administrative level to retrieve (0, 1, 2, etc.).
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "no_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "json".
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            tuple: A tuple containing:
                - dict: JSON/GeoJSON response with entity data
                - dict: Metadata with pagination info (page, total_page, total_count)

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
        )
        params = {"page": page, "page_size": page_size, "geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)

        # Pagination details are returned in the response headers, not the body.
        metadata = {
            "page": int(response.headers.get("page", 1)),
            "total_page": int(response.headers.get("total_page", 1)),
            "total_count": int(response.headers.get("count", 0)),
        }

        return response.json(), metadata

    def get_entity_by_ucode(self, ucode, geom="full_geom", format="geojson"):
        """
        Get detailed information about a specific entity using its Ucode.

        A Ucode (Universal Code) is a unique identifier for geographic entities
        within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

        Args:
            ucode (str): The unique code identifier for the entity.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "full_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "geojson".

        Returns:
            dict: JSON/GeoJSON response containing entity details including
                geometry, properties, administrative level, and metadata.

        Raises:
            requests.HTTPError: If the API request fails or ucode is invalid.
        """
        endpoint = f"{self.base_url}/search/entity/ucode/{ucode}/"
        params = {"geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def list_entity_children(
        self, view_uuid, entity_ucode, geom="no_geom", format="json"
    ):
        """
        List direct children of an entity in the administrative hierarchy.

        For example, if given a country entity, this will return its states/provinces.
        If given a state entity, this will return its districts/counties.

        Args:
            view_uuid (str): The UUID of the view containing the entity.
            entity_ucode (str): The Ucode of the parent entity.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "no_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "json".

        Returns:
            dict: JSON/GeoJSON response containing list of child entities
                with their properties and optional geometry data.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/{entity_ucode}/children/"
        )
        params = {"geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def search_entities_by_name(self, view_uuid, name, page=1, page_size=50):
        """
        Search for entities by name using fuzzy matching.

        This performs a similarity-based search to find entities whose names
        match or are similar to the provided search term.

        Args:
            view_uuid (str): The UUID of the view to search within.
            name (str): The name or partial name to search for.
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            dict: JSON response containing paginated search results with
                matching entities and their similarity scores.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/{name}/"
        params = {"page": page, "page_size": page_size}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def get_admin_boundaries(
        self, view_uuid, admin_level=None, geom="full_geom", format="geojson"
    ):
        """
        Get administrative boundaries for a specific level or all levels.

        This is a convenience method that can retrieve boundaries for a single
        administrative level or attempt to fetch all available levels.

        Args:
            view_uuid (str): The UUID of the view to query.
            admin_level (int, optional): Administrative level to retrieve
                (0=country, 1=region, etc.). If None, attempts to fetch all levels.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "full_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "geojson".

        Returns:
            dict: JSON/GeoJSON response containing administrative boundaries
                in the specified format. For GeoJSON, returns a FeatureCollection.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        # Construct the endpoint based on whether admin_level is provided
        if admin_level is not None:
            endpoint = (
                f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
            )
        else:
            # For all levels, we need to fetch level 0 and then get children for each entity
            endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/list/"

        params = {
            "geom": geom,
            "format": format,
            "page_size": 100,
        }

        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def get_vector_tiles_url(self, view_info):
        """
        Generate an authenticated URL for accessing vector tiles.

        Vector tiles are used for efficient map rendering and can be consumed
        by mapping libraries like Mapbox GL JS or OpenLayers.

        Args:
            view_info (dict): Dictionary containing view information that must
                include a 'vector_tiles' key with the base vector tiles URL.

        Returns:
            str: Fully authenticated vector tiles URL with API key and user email
                parameters appended for access control.

        Raises:
            ValueError: If 'vector_tiles' key is not found in view_info.
        """
        if "vector_tiles" not in view_info:
            raise ValueError("Vector tiles URL not found in view information")

        vector_tiles_url = view_info["vector_tiles"]

        # Parse out the timestamp parameter if it exists. Split only on the
        # first occurrence so a stray "?t=" later in the URL cannot raise.
        if "?t=" in vector_tiles_url:
            base_url, timestamp = vector_tiles_url.split("?t=", 1)
            return f"{base_url}?t={timestamp}&token={self.api_key}&georepo_user_key={self.email}"
        else:
            return (
                f"{vector_tiles_url}?token={self.api_key}&georepo_user_key={self.email}"
            )

    def find_country_by_iso3(self, view_uuid, iso3_code):
        """
        Find a country entity using its ISO3 country code.

        This method searches through all level-0 (country) entities to find
        one that matches the provided ISO3 code. It checks both the entity's
        Ucode and any external codes stored in the ext_codes field.

        Args:
            view_uuid (str): The UUID of the view to search within.
            iso3_code (str): The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

        Returns:
            dict or None: Entity information dictionary for the matching country
                if found, including properties like name, ucode, admin_level, etc.
                Returns None if no matching country is found.

        Note:
            This method handles pagination automatically to search through all
            available countries in the dataset, which may involve multiple API calls.

        Raises:
            requests.HTTPError: If the API request fails or view_uuid is invalid.
        """
        # Admin level 0 represents countries
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/level/0/"
        params = {
            "page_size": 100,
            "geom": "no_geom",
        }

        # need to paginate since it can be a large dataset
        all_countries = []
        page = 1

        while True:
            params["page"] = page
            response = self._make_request("GET", endpoint, params=params)
            data = response.json()

            countries = data.get("results", [])
            all_countries.extend(countries)

            # check if there are more pages
            if page >= data.get("total_page", 1):
                break

            page += 1

        # Search by ISO3 code
        for country in all_countries:
            # Check if ISO3 code is in the ucode (typically at the beginning)
            if country["ucode"].startswith(iso3_code + "_"):
                return country

            # Also check in ext_codes which may contain the ISO3 code
            ext_codes = country.get("ext_codes", {})
            if ext_codes:
                # Check if ISO3 is directly in ext_codes
                if (
                    ext_codes.get("PCode", "") == iso3_code
                    or ext_codes.get("default", "") == iso3_code
                ):
                    return country

        return None
__init__(api_key=None, email=None)

Initialize the GeoRepo client.

Parameters:

Name Type Description Default
api_key str

GeoRepo API key. If not provided, will use the GEOREPO_API_KEY environment variable from config.

None
email str

Email address associated with the API key. If not provided, will use the GEOREPO_USER_EMAIL environment variable from config.

None

Raises:

Type Description
ValueError

If api_key or email is not provided and cannot be found in environment variables.

Source code in gigaspatial/handlers/unicef_georepo.py
def __init__(self, api_key=None, email=None):
    """
    Initialize the GeoRepo client.

    Args:
        api_key (str, optional): GeoRepo API key. If not provided, will use
            the GEOREPO_API_KEY environment variable from config.
        email (str, optional): Email address associated with the API key.
            If not provided, will use the GEOREPO_USER_EMAIL environment
            variable from config.

    Raises:
        ValueError: If api_key or email is not provided and cannot be found
            in environment variables.
    """
    self.base_url = "https://georepo.unicef.org/api/v1"
    self.api_key = api_key or config.GEOREPO_API_KEY
    self.email = email or config.GEOREPO_USER_EMAIL
    self.logger = config.get_logger(self.__class__.__name__)

    if not self.api_key:
        raise ValueError(
            "API Key is required. Provide it as a parameter or set GEOREPO_API_KEY environment variable."
        )

    if not self.email:
        raise ValueError(
            "Email is required. Provide it as a parameter or set GEOREPO_USER_EMAIL environment variable."
        )

    self.headers = {
        "Accept": "application/json",
        "Authorization": f"Token {self.api_key}",
        "GeoRepo-User-Key": self.email,
    }
check_connection()

Checks if the API connection is valid by making a simple request.

Returns:

Name Type Description
bool

True if the connection is valid, False otherwise.

Source code in gigaspatial/handlers/unicef_georepo.py
def check_connection(self):
    """
    Checks if the API connection is valid by making a simple request.

    Returns:
        bool: True if the connection is valid, False otherwise.
    """
    endpoint = f"{self.base_url}/search/module/list/"
    try:
        self._make_request("GET", endpoint)
        return True
    except requests.exceptions.HTTPError as e:
        return False
    except requests.exceptions.RequestException as e:
        raise requests.exceptions.RequestException(
            f"Connection check encountered a network error: {e}"
        )
find_country_by_iso3(view_uuid, iso3_code)

Find a country entity using its ISO3 country code.

This method searches through all level-0 (country) entities to find one that matches the provided ISO3 code. It checks both the entity's Ucode and any external codes stored in the ext_codes field.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to search within.

required
iso3_code str

The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

required

Returns:

Type Description

dict or None: Entity information dictionary for the matching country if found, including properties like name, ucode, admin_level, etc. Returns None if no matching country is found.

Note

This method handles pagination automatically to search through all available countries in the dataset, which may involve multiple API calls.

Raises:

Type Description
HTTPError

If the API request fails or view_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def find_country_by_iso3(self, view_uuid, iso3_code):
    """
    Find a country entity using its ISO3 country code.

    This method searches through all level-0 (country) entities to find
    one that matches the provided ISO3 code. It checks both the entity's
    Ucode and any external codes stored in the ext_codes field.

    Args:
        view_uuid (str): The UUID of the view to search within.
        iso3_code (str): The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

    Returns:
        dict or None: Entity information dictionary for the matching country
            if found, including properties like name, ucode, admin_level, etc.
            Returns None if no matching country is found.

    Note:
        This method handles pagination automatically to search through all
        available countries in the dataset, which may involve multiple API calls.

    Raises:
        requests.HTTPError: If the API request fails or view_uuid is invalid.
    """
    # Admin level 0 represents countries
    endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/level/0/"
    params = {
        "page_size": 100,
        "geom": "no_geom",
    }

    # need to paginate since it can be a large dataset
    all_countries = []
    page = 1

    while True:
        params["page"] = page
        response = self._make_request("GET", endpoint, params=params)
        data = response.json()

        countries = data.get("results", [])
        all_countries.extend(countries)

        # check if there are more pages
        if page >= data.get("total_page", 1):
            break

        page += 1

    # Search by ISO3 code
    for country in all_countries:
        # Check if ISO3 code is in the ucode (typically at the beginning)
        if country["ucode"].startswith(iso3_code + "_"):
            return country

        # Also check in ext_codes which may contain the ISO3 code
        ext_codes = country.get("ext_codes", {})
        if ext_codes:
            # Check if ISO3 is directly in ext_codes
            if (
                ext_codes.get("PCode", "") == iso3_code
                or ext_codes.get("default", "") == iso3_code
            ):
                return country

    return None
get_admin_boundaries(view_uuid, admin_level=None, geom='full_geom', format='geojson')

Get administrative boundaries for a specific level or all levels.

This is a convenience method that can retrieve boundaries for a single administrative level or attempt to fetch all available levels.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to query.

required
admin_level int

Administrative level to retrieve (0=country, 1=region, etc.). If None, attempts to fetch all levels.

None
geom str

Geometry inclusion level. Options: - "no_geom": No geometry data - "centroid": Only centroid points - "full_geom": Complete boundary geometries Defaults to "full_geom".

'full_geom'
format str

Response format ("json" or "geojson"). Defaults to "geojson".

'geojson'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing administrative boundaries in the specified format. For GeoJSON, returns a FeatureCollection.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_admin_boundaries(
    self, view_uuid, admin_level=None, geom="full_geom", format="geojson"
):
    """
    Get administrative boundaries for a specific level or all levels.

    This is a convenience method that can retrieve boundaries for a single
    administrative level or attempt to fetch all available levels.

    Args:
        view_uuid (str): The UUID of the view to query.
        admin_level (int, optional): Administrative level to retrieve
            (0=country, 1=region, etc.). If None, attempts to fetch all levels.
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "full_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "geojson".

    Returns:
        dict: JSON/GeoJSON response containing administrative boundaries
            in the specified format. For GeoJSON, returns a FeatureCollection.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    # Construct the endpoint based on whether admin_level is provided
    if admin_level is not None:
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
        )
    else:
        # For all levels, we need to fetch level 0 and then get children for each entity
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/list/"

    params = {
        "geom": geom,
        "format": format,
        "page_size": 100,
    }

    response = self._make_request("GET", endpoint, params=params)
    return response.json()
get_dataset_details(dataset_uuid)

Get detailed information about a specific dataset.

This includes metadata about the dataset and information about available administrative levels (e.g., country, province, district).

Parameters:

Name Type Description Default
dataset_uuid str

The UUID of the dataset to query.

required

Returns:

Name Type Description
dict

JSON response containing dataset details including: - Basic metadata (name, description, etc.) - Available administrative levels and their properties - Temporal information and data sources

Raises:

Type Description
HTTPError

If the API request fails or dataset_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_dataset_details(self, dataset_uuid):
    """
    Get detailed information about a specific dataset.

    This includes metadata about the dataset and information about
    available administrative levels (e.g., country, province, district).

    Args:
        dataset_uuid (str): The UUID of the dataset to query.

    Returns:
        dict: JSON response containing dataset details including:
            - Basic metadata (name, description, etc.)
            - Available administrative levels and their properties
            - Temporal information and data sources

    Raises:
        requests.HTTPError: If the API request fails or dataset_uuid is invalid.
    """
    endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/"
    response = self._make_request("GET", endpoint)
    return response.json()
get_entity_by_ucode(ucode, geom='full_geom', format='geojson')

Get detailed information about a specific entity using its Ucode.

A Ucode (Universal Code) is a unique identifier for geographic entities within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

Parameters:

Name Type Description Default
ucode str

The unique code identifier for the entity.

required
geom str

Geometry inclusion level. Options: - "no_geom": No geometry data - "centroid": Only centroid points - "full_geom": Complete boundary geometries Defaults to "full_geom".

'full_geom'
format str

Response format ("json" or "geojson"). Defaults to "geojson".

'geojson'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing entity details including geometry, properties, administrative level, and metadata.

Raises:

Type Description
HTTPError

If the API request fails or ucode is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_entity_by_ucode(self, ucode, geom="full_geom", format="geojson"):
    """
    Get detailed information about a specific entity using its Ucode.

    A Ucode (Universal Code) is a unique identifier for geographic entities
    within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

    Args:
        ucode (str): The unique code identifier for the entity.
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "full_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "geojson".

    Returns:
        dict: JSON/GeoJSON response containing entity details including
            geometry, properties, administrative level, and metadata.

    Raises:
        requests.HTTPError: If the API request fails or ucode is invalid.
    """
    endpoint = f"{self.base_url}/search/entity/ucode/{ucode}/"
    params = {"geom": geom, "format": format}
    response = self._make_request("GET", endpoint, params=params)
    return response.json()
get_vector_tiles_url(view_info)

Generate an authenticated URL for accessing vector tiles.

Vector tiles are used for efficient map rendering and can be consumed by mapping libraries like Mapbox GL JS or OpenLayers.

Parameters:

Name Type Description Default
view_info dict

Dictionary containing view information that must include a 'vector_tiles' key with the base vector tiles URL.

required

Returns:

Name Type Description
str

Fully authenticated vector tiles URL with API key and user email parameters appended for access control.

Raises:

Type Description
ValueError

If 'vector_tiles' key is not found in view_info.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_vector_tiles_url(self, view_info):
    """
    Build an authenticated vector-tiles URL from view information.

    Vector tiles enable efficient map rendering and can be consumed by
    mapping libraries such as Mapbox GL JS or OpenLayers.

    Args:
        view_info (dict): View information; must contain a 'vector_tiles'
            key holding the base vector tiles URL.

    Returns:
        str: The vector tiles URL with the API key and user email appended
            as query parameters for access control.

    Raises:
        ValueError: If 'vector_tiles' key is not found in view_info.
    """
    if "vector_tiles" not in view_info:
        raise ValueError("Vector tiles URL not found in view information")

    url = view_info["vector_tiles"]
    auth = f"token={self.api_key}&georepo_user_key={self.email}"

    # Preserve an existing "?t=<timestamp>" query parameter if present.
    if "?t=" in url:
        base, stamp = url.split("?t=")
        return f"{base}?t={stamp}&{auth}"
    return f"{url}?{auth}"
list_datasets_by_module(module_uuid)

List all datasets within a specific module.

A dataset represents a collection of related geographic entities, such as administrative boundaries for a specific country or region.

Parameters:

Name Type Description Default
module_uuid str

The UUID of the module to query.

required

Returns:

Name Type Description
dict

JSON response containing a list of datasets with their metadata. Each dataset includes 'uuid', 'name', 'description', creation date, etc.

Raises:

Type Description
HTTPError

If the API request fails or module_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_datasets_by_module(self, module_uuid):
    """
    List every dataset contained in a given module.

    A dataset groups related geographic entities, such as administrative
    boundaries for a particular country or region.

    Args:
        module_uuid (str): UUID of the module to query.

    Returns:
        dict: Parsed JSON body listing datasets and their metadata
            ('uuid', 'name', 'description', creation date, etc.).

    Raises:
        requests.HTTPError: If the API request fails or module_uuid is invalid.
    """
    url = f"{self.base_url}/search/module/{module_uuid}/dataset/list/"
    return self._make_request("GET", url).json()
list_entities_by_admin_level(view_uuid, admin_level, geom='no_geom', format='json', page=1, page_size=50)

List entities at a specific administrative level within a view.

Administrative levels typically follow a hierarchy: - Level 0: Countries - Level 1: States/Provinces/Regions - Level 2: Districts/Counties - Level 3: Sub-districts/Municipalities - And so on...

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to query.

required
admin_level int

The administrative level to retrieve (0, 1, 2, etc.).

required
geom str

Geometry inclusion level. Options: - "no_geom": No geometry data - "centroid": Only centroid points - "full_geom": Complete boundary geometries Defaults to "no_geom".

'no_geom'
format str

Response format ("json" or "geojson"). Defaults to "json".

'json'
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
tuple

A tuple containing: - dict: JSON/GeoJSON response with entity data - dict: Metadata with pagination info (page, total_page, total_count)

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_entities_by_admin_level(
    self,
    view_uuid,
    admin_level,
    geom="no_geom",
    format="json",
    page=1,
    page_size=50,
):
    """
    List entities at one administrative level within a view.

    Administrative levels typically follow a hierarchy:
    - Level 0: Countries
    - Level 1: States/Provinces/Regions
    - Level 2: Districts/Counties
    - Level 3: Sub-districts/Municipalities
    - And so on...

    Args:
        view_uuid (str): UUID of the view to query.
        admin_level (int): Administrative level to retrieve (0, 1, 2, ...).
        geom (str, optional): Geometry inclusion — "no_geom", "centroid",
            or "full_geom". Defaults to "no_geom".
        format (str, optional): Response format, "json" or "geojson".
            Defaults to "json".
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Results per page. Defaults to 50.

    Returns:
        tuple: (entity data as dict, pagination metadata dict with keys
            'page', 'total_page', 'total_count').

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    url = f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
    query = {"page": page, "page_size": page_size, "geom": geom, "format": format}
    resp = self._make_request("GET", url, params=query)

    # Pagination details arrive in response headers, not in the body.
    headers = resp.headers
    meta = {
        "page": int(headers.get("page", 1)),
        "total_page": int(headers.get("total_page", 1)),
        "total_count": int(headers.get("count", 0)),
    }
    return resp.json(), meta
list_entity_children(view_uuid, entity_ucode, geom='no_geom', format='json')

List direct children of an entity in the administrative hierarchy.

For example, if given a country entity, this will return its states/provinces. If given a state entity, this will return its districts/counties.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view containing the entity.

required
entity_ucode str

The Ucode of the parent entity.

required
geom str

Geometry inclusion level. Options: - "no_geom": No geometry data - "centroid": Only centroid points - "full_geom": Complete boundary geometries Defaults to "no_geom".

'no_geom'
format str

Response format ("json" or "geojson"). Defaults to "json".

'json'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing list of child entities with their properties and optional geometry data.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_entity_children(
    self, view_uuid, entity_ucode, geom="no_geom", format="json"
):
    """
    List the direct children of an entity in the administrative hierarchy.

    For example, a country entity yields its states/provinces, and a state
    entity yields its districts/counties.

    Args:
        view_uuid (str): UUID of the view containing the entity.
        entity_ucode (str): Ucode of the parent entity.
        geom (str, optional): Geometry inclusion — "no_geom", "centroid",
            or "full_geom". Defaults to "no_geom".
        format (str, optional): Response format, "json" or "geojson".
            Defaults to "json".

    Returns:
        dict: Parsed JSON/GeoJSON body listing the child entities with
            their properties and optional geometry.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    url = f"{self.base_url}/search/view/{view_uuid}/entity/{entity_ucode}/children/"
    return self._make_request(
        "GET", url, params={"geom": geom, "format": format}
    ).json()
list_modules()

List all available modules in GeoRepo.

A module is a top-level organizational unit that contains datasets. Examples include "Admin Boundaries", "Health Facilities", etc.

Returns:

Name Type Description
dict

JSON response containing a list of modules with their metadata. Each module includes 'uuid', 'name', 'description', and other properties.

Raises:

Type Description
HTTPError

If the API request fails.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_modules(self):
    """
    List all modules available in GeoRepo.

    A module is a top-level organizational unit that contains datasets,
    e.g. "Admin Boundaries" or "Health Facilities".

    Returns:
        dict: Parsed JSON body listing modules and their metadata
            ('uuid', 'name', 'description', and other properties).

    Raises:
        requests.HTTPError: If the API request fails.
    """
    return self._make_request("GET", f"{self.base_url}/search/module/list/").json()
list_views_by_dataset(dataset_uuid, page=1, page_size=50)

List views for a dataset with pagination support.

A view represents a specific version or subset of a dataset. Views may be tagged as 'latest' or represent different time periods.

Parameters:

Name Type Description Default
dataset_uuid str

The UUID of the dataset to query.

required
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
dict

JSON response containing paginated list of views with metadata. Includes 'results', 'total_page', 'current_page', and 'count' fields. Each view includes 'uuid', 'name', 'tags', and other properties.

Raises:

Type Description
HTTPError

If the API request fails or dataset_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_views_by_dataset(self, dataset_uuid, page=1, page_size=50):
    """
    List a dataset's views, with pagination.

    A view is a specific version or subset of a dataset; views may be
    tagged 'latest' or represent different time periods.

    Args:
        dataset_uuid (str): UUID of the dataset to query.
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Results per page. Defaults to 50.

    Returns:
        dict: Parsed JSON body with a paginated view list, including
            'results', 'total_page', 'current_page', and 'count' fields.
            Each view includes 'uuid', 'name', 'tags', and other properties.

    Raises:
        requests.HTTPError: If the API request fails or dataset_uuid is invalid.
    """
    url = f"{self.base_url}/search/dataset/{dataset_uuid}/view/list/"
    return self._make_request(
        "GET", url, params={"page": page, "page_size": page_size}
    ).json()
search_entities_by_name(view_uuid, name, page=1, page_size=50)

Search for entities by name using fuzzy matching.

This performs a similarity-based search to find entities whose names match or are similar to the provided search term.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to search within.

required
name str

The name or partial name to search for.

required
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
dict

JSON response containing paginated search results with matching entities and their similarity scores.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def search_entities_by_name(self, view_uuid, name, page=1, page_size=50):
    """
    Search entities by name with fuzzy matching.

    Performs a similarity-based lookup for entities whose names match or
    resemble the search term.

    Args:
        view_uuid (str): UUID of the view to search within.
        name (str): Name or partial name to search for.
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Results per page. Defaults to 50.

    Returns:
        dict: Parsed JSON body with paginated search results, including
            matching entities and their similarity scores.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    url = f"{self.base_url}/search/view/{view_uuid}/entity/{name}/"
    return self._make_request(
        "GET", url, params={"page": page, "page_size": page_size}
    ).json()

find_admin_boundaries_module()

Find and return the UUID of the Admin Boundaries module.

This is a convenience function that searches through all available modules to locate the one named "Admin Boundaries", which typically contains administrative boundary datasets.

Returns:

Name Type Description
str

The UUID of the Admin Boundaries module.

Raises:

Type Description
ValueError

If the Admin Boundaries module is not found.

Source code in gigaspatial/handlers/unicef_georepo.py
def find_admin_boundaries_module():
    """
    Return the UUID of the Admin Boundaries module.

    Convenience helper that scans all available modules for the one named
    exactly "Admin Boundaries", which typically holds administrative
    boundary datasets.

    Returns:
        str: UUID of the Admin Boundaries module.

    Raises:
        ValueError: If the Admin Boundaries module is not found.
    """
    # Exact name match, unlike the substring match used elsewhere.
    for module in GeoRepoClient().list_modules().get("results", []):
        if module["name"] == "Admin Boundaries":
            return module["uuid"]

    raise ValueError("Admin Boundaries module not found")

get_country_boundaries_by_iso3(iso3_code, client=None, admin_level=None)

Get administrative boundaries for a specific country using its ISO3 code.

This function provides a high-level interface to retrieve country boundaries by automatically finding the appropriate module, dataset, and view, then fetching the requested administrative boundaries.

The function will: 1. Find the Admin Boundaries module 2. Locate a global dataset within that module 3. Find the latest view of that dataset 4. Search for the country using the ISO3 code 5. Look for a country-specific view if available 6. Retrieve boundaries at the specified admin level or all levels

Parameters:

Name Type Description Default
iso3_code str

The ISO3 country code (e.g., 'USA', 'KEN', 'BRA').

required
admin_level int

The administrative level to retrieve: - 0: Country level - 1: State/Province/Region level - 2: District/County level - 3: Sub-district/Municipality level - etc. If None, retrieves all available administrative levels.

None

Returns:

Name Type Description
dict

A GeoJSON FeatureCollection containing the requested boundaries. Each feature includes geometry and properties for the administrative unit.

Raises:

Type Description
ValueError

If the Admin Boundaries module, datasets, views, or country cannot be found.

HTTPError

If any API requests fail.

Note

This function may make multiple API calls and can take some time for countries with many administrative units. It handles pagination automatically and attempts to use country-specific views when available for better performance.

Example
Get all administrative levels for Kenya

boundaries = get_country_boundaries_by_iso3('KEN')

Get only province-level boundaries for Kenya

provinces = get_country_boundaries_by_iso3('KEN', admin_level=1)

Source code in gigaspatial/handlers/unicef_georepo.py
def get_country_boundaries_by_iso3(
    iso3_code, client: GeoRepoClient = None, admin_level=None
):
    """
    Get administrative boundaries for a specific country using its ISO3 code.

    This function provides a high-level interface to retrieve country boundaries
    by automatically finding the appropriate module, dataset, and view, then
    fetching the requested administrative boundaries.

    The function will:
    1. Find the Admin Boundaries module
    2. Locate a global dataset within that module
    3. Find the latest view of that dataset
    4. Search for the country using the ISO3 code
    5. Look for a country-specific view if available
    6. Retrieve boundaries at the specified admin level or all levels

    Args:
        iso3_code (str): The ISO3 country code (e.g., 'USA', 'KEN', 'BRA').
        client (GeoRepoClient, optional): An existing GeoRepoClient instance
            to reuse. A new client is instantiated when not provided.
        admin_level (int, optional): The administrative level to retrieve:
            - 0: Country level
            - 1: State/Province/Region level
            - 2: District/County level
            - 3: Sub-district/Municipality level
            - etc.
            If None, retrieves all available administrative levels.

    Returns:
        dict: A GeoJSON FeatureCollection containing the requested boundaries.
            Each feature includes geometry and properties for the administrative unit.

    Raises:
        ValueError: If the Admin Boundaries module, datasets, views, or country
            cannot be found.
        requests.HTTPError: If any API requests fail.

    Note:
        This function may make multiple API calls and can take some time for
        countries with many administrative units. It handles pagination
        automatically and attempts to use country-specific views when available
        for better performance.

    Example:
        >>> # Get all administrative levels for Kenya
        >>> boundaries = get_country_boundaries_by_iso3('KEN')
        >>>
        >>> # Get only province-level boundaries for Kenya
        >>> provinces = get_country_boundaries_by_iso3('KEN', admin_level=1)
    """
    client = client or GeoRepoClient()

    client.logger.info("Finding Admin Boundaries module...")
    modules = client.list_modules()
    admin_module_uuid = None

    # Substring match: accepts any module whose name contains "Admin Boundaries".
    for module in modules.get("results", []):
        if "Admin Boundaries" in module["name"]:
            admin_module_uuid = module["uuid"]
            client.logger.info(
                f"Found Admin Boundaries module: {module['name']} ({admin_module_uuid})"
            )
            break

    if not admin_module_uuid:
        raise ValueError("Admin Boundaries module not found")

    client.logger.info(f"Finding datasets in the Admin Boundaries module...")
    datasets = client.list_datasets_by_module(admin_module_uuid)
    global_dataset_uuid = None

    # Prefer a dataset whose name mentions "global"; fall back to the first one.
    for dataset in datasets.get("results", []):
        if any(keyword in dataset["name"].lower() for keyword in ["global"]):
            global_dataset_uuid = dataset["uuid"]
            client.logger.info(
                f"Found global dataset: {dataset['name']} ({global_dataset_uuid})"
            )
            break

    if not global_dataset_uuid:
        if datasets.get("results"):
            global_dataset_uuid = datasets["results"][0]["uuid"]
            client.logger.info(
                f"Using first available dataset: {datasets['results'][0]['name']} ({global_dataset_uuid})"
            )
        else:
            raise ValueError("No datasets found in the Admin Boundaries module")

    client.logger.info(f"Finding views in the dataset...")
    views = client.list_views_by_dataset(global_dataset_uuid)
    latest_view_uuid = None

    # Prefer a view tagged "latest"; fall back to the first available view.
    # NOTE(review): only the first page of views is scanned here; the full
    # pagination happens later when searching for a country-specific view.
    for view in views.get("results", []):
        if "tags" in view and "latest" in view["tags"]:
            latest_view_uuid = view["uuid"]
            client.logger.info(
                f"Found latest view: {view['name']} ({latest_view_uuid})"
            )
            break

    if not latest_view_uuid:
        if views.get("results"):
            latest_view_uuid = views["results"][0]["uuid"]
            client.logger.info(
                f"Using first available view: {views['results'][0]['name']} ({latest_view_uuid})"
            )
        else:
            raise ValueError("No views found in the dataset")

    # Search for the country by ISO3 code
    client.logger.info(f"Searching for country with ISO3 code: {iso3_code}...")
    country_entity = client.find_country_by_iso3(latest_view_uuid, iso3_code)

    if not country_entity:
        raise ValueError(f"Country with ISO3 code '{iso3_code}' not found")

    country_ucode = country_entity["ucode"]
    country_name = country_entity["name"]
    client.logger.info(f"Found country: {country_name} (Ucode: {country_ucode})")

    # Search for country-specific view
    client.logger.info(f"Checking for country-specific view...")
    country_view_uuid = None
    all_views = []

    # Need to fetch all pages of views
    page = 1
    while True:
        views_page = client.list_views_by_dataset(global_dataset_uuid, page=page)
        all_views.extend(views_page.get("results", []))
        if page >= views_page.get("total_page", 1):
            break
        page += 1

    # Look for a view specifically for this country: the country name is
    # compared against the part of the view name before " (", and the view
    # must be tagged "latest".
    for view in all_views:
        if country_name.lower() == view["name"].split(" (")[
            0
        ].lower() and "latest" in view.get("tags", []):
            country_view_uuid = view["uuid"]
            client.logger.info(
                f"Found country-specific view: {view['name']} ({country_view_uuid})"
            )
            break

    # Get boundaries based on admin level
    if country_view_uuid:
        client.logger.info(country_view_uuid)
        # If we found a view specific to this country, use it
        client.logger.info(f"Getting admin boundaries from country-specific view...")
        if admin_level is not None:
            client.logger.info(f"Fetching admin level {admin_level} boundaries...")

            # Handle pagination for large datasets
            all_features = []
            page = 1
            while True:
                result, meta = client.list_entities_by_admin_level(
                    country_view_uuid,
                    admin_level,
                    geom="full_geom",
                    format="geojson",
                    page=page,
                    page_size=50,
                )

                # Add features to our collection
                if "features" in result:
                    all_features.extend(result["features"])
                elif "results" in result:
                    # Convert entities to GeoJSON features if needed
                    for entity in result["results"]:
                        if "geometry" in entity:
                            feature = {
                                "type": "Feature",
                                "properties": {
                                    k: v for k, v in entity.items() if k != "geometry"
                                },
                                "geometry": entity["geometry"],
                            }
                            all_features.append(feature)

                # Check if there are more pages
                if page >= meta["total_page"]:
                    break

                page += 1

            boundaries = {"type": "FeatureCollection", "features": all_features}
        else:
            # Get all admin levels by fetching each level separately
            boundaries = {"type": "FeatureCollection", "features": []}

            # Get dataset details to find available admin levels
            dataset_details = client.get_dataset_details(global_dataset_uuid)
            max_level = 0

            for level_info in dataset_details.get("dataset_levels", []):
                if isinstance(level_info.get("level"), int):
                    max_level = max(max_level, level_info["level"])

            client.logger.info(f"Dataset has admin levels from 0 to {max_level}")

            # Fetch each admin level
            for level in range(max_level + 1):
                client.logger.info(f"Fetching admin level {level}...")
                try:
                    level_data, meta = client.list_entities_by_admin_level(
                        country_view_uuid, level, geom="full_geom", format="geojson"
                    )

                    if "features" in level_data:
                        boundaries["features"].extend(level_data["features"])
                    elif "results" in level_data:
                        # Process each page of results
                        page = 1
                        while True:
                            result, meta = client.list_entities_by_admin_level(
                                country_view_uuid,
                                level,
                                geom="full_geom",
                                format="geojson",
                                page=page,
                            )

                            if "features" in result:
                                boundaries["features"].extend(result["features"])

                            # Check for more pages
                            if page >= meta["total_page"]:
                                break

                            page += 1

                except Exception as e:
                    # Best-effort: a level that fails to download is skipped,
                    # not fatal for the whole country.
                    client.logger.warning(f"Error fetching admin level {level}: {e}")
    else:
        # Use the global view with filtering
        client.logger.info(f"Using global view and filtering by country...")

        # Function to recursively get all descendants
        def get_all_children(
            parent_ucode, view_uuid, level=1, max_depth=5, admin_level_filter=None
        ):
            """
            Recursively retrieve all child entities of a parent entity.

            Args:
                parent_ucode (str): The Ucode of the parent entity.
                view_uuid (str): The UUID of the view to query.
                level (int): Current recursion level (for depth limiting).
                max_depth (int): Maximum recursion depth to prevent infinite loops.
                admin_level_filter (int, optional): If specified, only return
                    entities at this specific administrative level.

            Returns:
                list: List of GeoJSON features for all child entities.
            """
            if level > max_depth:
                return []

            try:
                children = client.list_entity_children(view_uuid, parent_ucode)
                features = []

                for child in children.get("results", []):
                    # Skip if we're filtering by admin level and this doesn't match
                    if (
                        admin_level_filter is not None
                        and child.get("admin_level") != admin_level_filter
                    ):
                        continue

                    # Get the child with full geometry
                    child_entity = client.get_entity_by_ucode(child["ucode"])
                    if "features" in child_entity:
                        features.extend(child_entity["features"])

                    # Recursively get grandchildren if not filtering by admin level
                    if admin_level_filter is None:
                        features.extend(
                            get_all_children(
                                child["ucode"], view_uuid, level + 1, max_depth
                            )
                        )

                return features
            except Exception as e:
                client.logger.warning(f"Error getting children for {parent_ucode}: {e}")
                return []

        # Start with the country boundaries
        boundaries = {"type": "FeatureCollection", "features": []}

        # If admin_level is 0, just get the country entity
        if admin_level == 0:
            country_entity = client.get_entity_by_ucode(country_ucode)
            if "features" in country_entity:
                boundaries["features"].extend(country_entity["features"])
        # If specific admin level requested, get all entities at that level
        elif admin_level is not None:
            children_features = get_all_children(
                country_ucode,
                latest_view_uuid,
                max_depth=admin_level + 1,
                admin_level_filter=admin_level,
            )
            boundaries["features"].extend(children_features)
        # If no admin_level specified, get all levels
        else:
            # Start with the country entity
            country_entity = client.get_entity_by_ucode(country_ucode)
            if "features" in country_entity:
                boundaries["features"].extend(country_entity["features"])

            # Get all descendants
            children_features = get_all_children(
                country_ucode, latest_view_uuid, max_depth=5
            )
            boundaries["features"].extend(children_features)

    return boundaries

worldpop

WPPopulationConfig dataclass

Bases: BaseHandlerConfig

Source code in gigaspatial/handlers/worldpop.py
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class WPPopulationConfig(BaseHandlerConfig):
    """Configuration for WorldPop population datasets.

    Normalizes the requested combination of project, year, resolution,
    UN adjustment, constrained and school-age flags against WorldPop dataset
    availability, resolves the matching REST ``dataset_category``, and maps
    dataset file URLs to local paths under ``base_path``.
    """

    # NOTE: class-level attribute, shared by every config instance.
    client = WorldPopRestClient()

    AVAILABLE_YEARS: List = Field(default=np.append(np.arange(2000, 2021), 2024))
    AVAILABLE_RESOLUTIONS: List = Field(default=[100, 1000])

    # user config
    base_path: Path = Field(default=global_config.get_path("worldpop", "bronze"))
    project: Literal["pop", "age_structures"] = Field(...)
    year: int = Field(...)
    resolution: int = Field(...)
    un_adjusted: bool = Field(...)
    constrained: bool = Field(...)
    school_age: bool = Field(...)

    def _filter_age_sex_paths(self, paths: List[Path], filters: Dict) -> List[Path]:
        """
        Helper to filter a list of WorldPop age_structures paths based on sex,
        age, and education level filters.

        Args:
            paths: Candidate raster paths.
            filters: Optional keys ``sex_filters`` and ``level_filters``
                (school-age files) plus ``ages_filter``, ``min_age`` and
                ``max_age`` (non-school-age files); a missing key disables
                the corresponding filter.

        Returns:
            The subset of ``paths`` that satisfies every supplied filter.
        """
        sex_filters = filters.get("sex_filters")
        level_filters = filters.get("level_filters")  # For school_age=True
        ages_filter = filters.get("ages_filter")
        min_age = filters.get("min_age")
        max_age = filters.get("max_age")

        filtered_paths: List[Path] = []

        for p in paths:
            # Expected basename patterns:
            # - School age: DJI_M_SECONDARY_2020_1km.tif (ISO3_SEX_EDUCATIONLEVEL_YEAR_RES.tif)
            # - Non-school age: RWA_F_25_2020.tif (ISO3_SEX_AGE_YEAR.tif)
            bn = p.name
            stem = os.path.splitext(bn)[0]
            parts = stem.split("_")

            sex_val, age_val, education_level_val = None, None, None

            # Simple heuristic to differentiate school_age vs non-school_age filenames
            # Check for keywords related to education levels (assuming these are unique to school_age files)
            is_school_age_filename = any(
                lvl in stem.upper() for lvl in ["PRIMARY", "SECONDARY"]
            )

            if (
                is_school_age_filename
            ):  # Filenames like DJI_M_SECONDARY_2020_1km.tif, DJI_F_M_SECONDARY_2020_1km.tif
                if len(parts) >= 4:
                    # Determine sex_val and education_level_val based on patterns
                    if (
                        len(parts) > 2
                        and parts[1].upper() == "F"
                        and parts[2].upper() == "M"
                    ):
                        # Pattern: ISO3_F_M_EDUCATIONLEVEL_YEAR...
                        sex_val = "F_M"
                        if len(parts) > 3:  # Ensure index exists
                            education_level_val = parts[3].upper()
                    elif len(parts) > 1:
                        # Pattern: ISO3_SEX_EDUCATIONLEVEL_YEAR... (SEX is F or M)
                        sex_val = parts[1].upper()
                        if len(parts) > 2:  # Ensure index exists
                            education_level_val = parts[2].upper()
            else:  # Filenames like RWA_F_25_2020.tif
                if len(parts) >= 4:
                    sex_val = parts[1].upper()
                    try:
                        age_val = int(parts[2])
                    except (ValueError, IndexError):
                        age_val = None

            # --- Apply sex filter ---
            if sex_filters:
                # Explicit matching for all cases
                sex_ok = False
                if "F_M" in sex_filters and sex_val == "F_M":
                    sex_ok = True
                elif "F" in sex_filters and sex_val == "F":
                    sex_ok = True
                elif "M" in sex_filters and sex_val == "M":
                    sex_ok = True

                if not sex_ok:
                    continue
            elif self.project == "age_structures" and self.school_age:
                # Default for school_age=True with no sex filter: only load F_M
                if sex_val != "F_M":
                    continue

            # --- Apply education level filter (only relevant for school_age filenames) ---
            if level_filters and is_school_age_filename:
                if (
                    education_level_val is None
                    or education_level_val not in level_filters
                ):
                    continue

            # --- Apply age filters (only relevant for non-school_age filenames) ---
            if (
                ages_filter is not None or min_age is not None or max_age is not None
            ) and not is_school_age_filename:
                if age_val is not None:
                    if ages_filter is not None and age_val not in ages_filter:
                        continue
                    if min_age is not None and age_val < int(min_age):
                        continue
                    if max_age is not None and age_val > int(max_age):
                        continue
                else:  # If age filters are specified but age_val couldn't be parsed
                    self.logger.warning(
                        f"Could not parse age from filename {p.name} but age filters were applied. Skipping file."
                    )
                    continue

            filtered_paths.append(p)

        return filtered_paths

    @field_validator("year")
    def validate_year(cls, value: int) -> int:
        """Reject years that have no WorldPop datasets available."""
        if value in cls.AVAILABLE_YEARS:
            return value
        raise ValueError(
            f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
        )

    @field_validator("resolution")
    def validate_resolution(cls, value: int) -> int:
        """Reject resolutions that have no WorldPop datasets available."""
        if value in cls.AVAILABLE_RESOLUTIONS:
            return value
        raise ValueError(
            f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
        )

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Specific rules:
        - For age_structures:
            - School age data is only available for 2020 at 1km resolution.
            - Non-school age data is only available at 100m resolution.
            - Unconstrained, non-school age data is only available without UN adjustment.
            - Constrained, non-school age data with UN adjustment is only available for 2020.
            - Constrained, non-school age data without UN adjustment is only available for 2020 and 2024.
        - For pop:
            - 2024 data is only available at 100m resolution and without UN adjustment.
            - Constrained data (other than 2024) is only available for 2020 at 100m resolution.
            - Unconstrained data at 100m or 1km is available for other years, with or without UN adjustment.

        Side effects: may coerce `year`, `resolution`, `un_adjusted` and
        `constrained` to the nearest available combination (logging a
        warning), and assigns `self.dataset_category`.
        """

        if self.project == "age_structures":

            if self.school_age:
                if self.resolution == 100:
                    self.logger.warning(
                        "School age population datasets are only available at 1km `resolution`, resolution is set as 1000"
                    )
                    self.resolution = 1000

                if self.year != 2020:
                    self.logger.warning(
                        "School age population datasets are only available for 2020, `year` is set as 2020"
                    )
                    self.year = 2020

                if self.un_adjusted:
                    self.logger.warning(
                        "School age population datasets are only available without UN adjustment, `un_adjusted` is set as False"
                    )
                    self.un_adjusted = False

                if self.constrained:
                    self.logger.warning(
                        "School age population datasets are only available unconstrained, `constrained` is set as False"
                    )
                    self.constrained = False

                self.dataset_category = "sapya1km"
            else:
                if self.resolution == 1000:
                    self.logger.warning(
                        "Age structures datasets are only available at 100m resolution, `resolution` is set as 100"
                    )
                    self.resolution = 100

                if not self.constrained:
                    if self.un_adjusted:
                        self.logger.warning(
                            "Age structures unconstrained datasets are only available without UN adjustment, `un_adjusted` is set as False"
                        )
                        self.un_adjusted = False

                    self.dataset_category = (
                        "G2_UC_Age_2024_100m" if self.year == 2024 else "aswpgp"
                    )
                else:
                    if self.un_adjusted:
                        if self.year != 2020:
                            self.logger.warning(
                                "Age structures constrained datasets with UN adjustment are only available for 2020, `year` is set as 2020"
                            )
                            self.year = 2020
                        self.dataset_category = "ascicua_2020"
                    else:
                        if self.year == 2024:
                            self.dataset_category = "G2_CN_Age_2024_100m"
                        elif self.year == 2020:
                            self.dataset_category = "ascic_2020"
                        else:
                            raise ValueError(
                                "Age structures constrained datasets without UN adjustment are only available for 2020 and 2024, please set `year` to one of the available options: 2020, 2024"
                            )

        elif self.project == "pop":

            if self.school_age:
                raise ValueError(
                    f"""
                    Received unexpected value of `{self.school_age}` for project: `{self.project}`.
                    For school age population datasets, please set project as `age_structures`.
                    """
                )

            if self.year == 2024:
                if self.resolution == 1000:
                    self.logger.warning(
                        "2024 datasets are only available at 100m resolution, `resolution` is set as 100m"
                    )
                    self.resolution = 100
                if self.un_adjusted:
                    self.logger.warning(
                        "2024 datasets are only available without UN adjustment, `un_adjusted` is set as False"
                    )
                    self.un_adjusted = False

                self.dataset_category = (
                    "G2_CN_POP_2024_100m" if self.constrained else "G2_UC_POP_2024_100m"
                )
            else:
                if self.constrained:
                    if self.year != 2020:
                        self.logger.warning(
                            "Population constrained datasets are only available for 2020, `year` is set as 2020"
                        )
                        self.year = 2020

                    if self.resolution != 100:
                        self.logger.warning(
                            "Population constrained datasets are only available at 100m resolution, `resolution` is set as 100"
                        )
                        self.resolution = 100

                    self.dataset_category = (
                        "cic2020_UNadj_100m" if self.un_adjusted else "cic2020_100m"
                    )
                else:
                    if self.resolution == 100:
                        self.dataset_category = (
                            f"wpgp{'unadj' if self.un_adjusted else ''}"
                        )
                    else:
                        self.dataset_category = (
                            "wpic1km" if not self.un_adjusted else "wpicuadj1km"
                        )

    def get_relevant_data_units_by_geometry(
        self, geometry: str, **kwargs
    ) -> List[str]:
        """Search WorldPop for datasets matching this config and a country.

        Args:
            geometry: Country identifier passed to the WorldPop search
                (an ISO3 code, see `extract_search_geometry`).

        Returns:
            File URLs of the first matching dataset. For the school-age
            category ("sapya1km") all files are kept; otherwise only
            `.tif` files.

        Raises:
            RuntimeError: If no dataset matches the configuration.
        """
        datasets = self.client.search_datasets(
            self.project, self.dataset_category, geometry, self.year
        )

        if not datasets:
            raise RuntimeError(
                f"No WorldPop datasets found for country: {geometry}, "
                f"project: {self.project}, category: {self.dataset_category}, year: {self.year}. "
                "Please check the configuration parameters."
            )

        files = [
            file
            for file in datasets[0].get("files", [])
            if ((self.dataset_category == "sapya1km") or file.endswith(".tif"))
        ]

        return files

    def get_data_unit_path(self, unit: str, **kwargs) -> Path:
        """
        Given a WP file url, return the corresponding path.
        """
        return self.base_path / unit.split("GIS/")[1]

    def get_data_unit_paths(self, units: Union[List[str], str], **kwargs) -> list:
        """
        Given WP file url(s), return the corresponding local file paths.

        - For school_age age_structures (zip resources), if extracted .tif files are present
        in the target directory, return those; otherwise, return the zip path(s) to allow
        the downloader to fetch and extract them.
        - For non-school_age age_structures (individual .tif URLs), you can filter by sex and age
        using kwargs: sex, ages, min_age, max_age.
        """
        if not isinstance(units, list):
            units = [units]

        # Extract optional filters
        sex = kwargs.get("sex")
        education_level = kwargs.get("education_level") or kwargs.get("level")
        ages_filter = kwargs.get("ages")
        min_age = kwargs.get("min_age")
        max_age = kwargs.get("max_age")

        def _to_set(v):
            if v is None:
                return None
            if isinstance(v, (list, tuple, set)):
                return {str(x).upper() for x in v}
            return {str(v).upper()}

        sex_filters = _to_set(sex)
        level_filters = _to_set(education_level)

        # 1) School-age branch (zip → extracted tifs)
        if self.project == "age_structures" and self.school_age:
            resolved_paths: List[Path] = []
            for url in units:
                output_dir = self.get_data_unit_path(url).parent

                if self.data_store.is_dir(str(output_dir)):
                    try:
                        all_extracted_tifs = [
                            Path(f)
                            for f in self.data_store.list_files(str(output_dir))
                            if f.lower().endswith(".tif")
                        ]
                        # Apply filters to extracted tifs
                        filtered_tifs = self._filter_age_sex_paths(
                            all_extracted_tifs,
                            {
                                "sex_filters": sex_filters,
                                "level_filters": level_filters,
                            },
                        )
                        resolved_paths.extend(filtered_tifs)
                    except Exception:
                        resolved_paths.append(self.get_data_unit_path(url))  # Fallback
                else:
                    resolved_paths.append(
                        self.get_data_unit_path(url)
                    )  # Fallback if not extracted yet

            return resolved_paths

        # 2) Non-school_age age_structures (individual tif URLs) with DEFERRED sex/age filters
        if self.project == "age_structures" and not self.school_age:
            # Store filters in a way that the reader can access them if needed
            self._temp_age_sex_filters = {
                "sex_filters": sex_filters,
                "ages_filter": ages_filter,
                "min_age": min_age,
                "max_age": max_age,
            }
            # Here, we don't apply the filters yet. We return all potential paths.
            # The actual filtering will happen in the reader or during TifProcessor loading.
            return [self.get_data_unit_path(unit) for unit in units]

        # Default behavior for all other datasets
        return [self.get_data_unit_path(unit) for unit in units]

    def extract_search_geometry(self, source, **kwargs):
        """
        Override the method since geometry extraction does not apply.
        Returns country iso3 for dataset search
        """
        if not isinstance(source, str):
            raise ValueError(
                f"Unsupported source type: {type(source)}. "
                "Please use country-based (str) filtering."
            )

        return pycountry.countries.lookup(source).alpha_3

    def __repr__(self) -> str:
        # Must return a single str (previously returned a tuple by mistake).
        return (
            "WPPopulationConfig("
            f"project={self.project}, "
            f"year={self.year}, "
            f"resolution={self.resolution}, "
            f"un_adjusted={self.un_adjusted}, "
            f"constrained={self.constrained}, "
            f"school_age={self.school_age}"
            ")"
        )
extract_search_geometry(source, **kwargs)

Override the method since geometry extraction does not apply. Returns country iso3 for dataset search

Source code in gigaspatial/handlers/worldpop.py
def extract_search_geometry(self, source, **kwargs):
    """
    Override the method since geometry extraction does not apply.
    Returns country iso3 for dataset search.

    Args:
        source: Country name or code (must be a string).

    Returns:
        ISO3 alpha-3 code for the looked-up country.

    Raises:
        ValueError: If ``source`` is not a string.
    """
    if not isinstance(source, str):
        # Fix: the two message fragments were previously concatenated
        # without a separator ("...<class 'int'>Please use...").
        raise ValueError(
            f"Unsupported source type: {type(source)}. "
            "Please use country-based (str) filtering."
        )

    return pycountry.countries.lookup(source).alpha_3
get_data_unit_path(unit, **kwargs)

Given a WP file url, return the corresponding path.

Source code in gigaspatial/handlers/worldpop.py
def get_data_unit_path(self, unit: str, **kwargs) -> Path:
    """Map a WorldPop file URL onto the local path under ``base_path``.

    Everything after the first ``GIS/`` segment of the URL is kept as the
    relative path.
    """
    relative_part = unit.split("GIS/")[1]
    return self.base_path / relative_part
get_data_unit_paths(units, **kwargs)

Given WP file url(s), return the corresponding local file paths.

  • For school_age age_structures (zip resources), if extracted .tif files are present in the target directory, return those; otherwise, return the zip path(s) to allow the downloader to fetch and extract them.
  • For non-school_age age_structures (individual .tif URLs), you can filter by sex and age using kwargs: sex, ages, min_age, max_age.
Source code in gigaspatial/handlers/worldpop.py
def get_data_unit_paths(self, units: Union[List[str], str], **kwargs) -> list:
    """
    Given WP file url(s), return the corresponding local file paths.

    - For school_age age_structures (zip resources), if extracted .tif files are present
    in the target directory, return those; otherwise, return the zip path(s) to allow
    the downloader to fetch and extract them.
    - For non-school_age age_structures (individual .tif URLs), you can filter by sex and age
    using kwargs: sex, ages, min_age, max_age.
    """
    # Normalize to a list so a single URL and a batch take the same path.
    if not isinstance(units, list):
        units = [units]

    # Extract optional filters
    sex = kwargs.get("sex")
    # "education_level" takes precedence over the shorter "level" alias.
    education_level = kwargs.get("education_level") or kwargs.get("level")
    ages_filter = kwargs.get("ages")
    min_age = kwargs.get("min_age")
    max_age = kwargs.get("max_age")

    def _to_set(v):
        # Normalize a scalar or collection filter into an upper-cased set
        # (None stays None, meaning "no filter").
        if v is None:
            return None
        if isinstance(v, (list, tuple, set)):
            return {str(x).upper() for x in v}
        return {str(v).upper()}

    sex_filters = _to_set(sex)
    level_filters = _to_set(education_level)

    # 1) School-age branch (zip → extracted tifs)
    if self.project == "age_structures" and self.school_age:
        resolved_paths: List[Path] = []
        for url in units:
            # Extracted rasters live next to where the zip would be stored.
            output_dir = self.get_data_unit_path(url).parent

            if self.data_store.is_dir(str(output_dir)):
                # Broad catch: any listing/filtering failure falls back to
                # returning the zip path so the downloader can re-fetch it.
                try:
                    all_extracted_tifs = [
                        Path(f)
                        for f in self.data_store.list_files(str(output_dir))
                        if f.lower().endswith(".tif")
                    ]
                    # Apply filters to extracted tifs
                    filtered_tifs = self._filter_age_sex_paths(
                        all_extracted_tifs,
                        {
                            "sex_filters": sex_filters,
                            "level_filters": level_filters,
                        },
                    )
                    resolved_paths.extend(filtered_tifs)
                except Exception:
                    resolved_paths.append(self.get_data_unit_path(url))  # Fallback
            else:
                resolved_paths.append(
                    self.get_data_unit_path(url)
                )  # Fallback if not extracted yet

        return resolved_paths

    # 2) Non-school_age age_structures (individual tif URLs) with DEFERRED sex/age filters
    if self.project == "age_structures" and not self.school_age:
        # Store filters in a way that the reader can access them if needed
        # NOTE(review): this is a side channel on the instance; presumably
        # consumed by the reader downstream — confirm before relying on it.
        self._temp_age_sex_filters = {
            "sex_filters": sex_filters,
            "ages_filter": ages_filter,
            "min_age": min_age,
            "max_age": max_age,
        }
        # Here, we don't apply the filters yet. We return all potential paths.
        # The actual filtering will happen in the reader or during TifProcessor loading.
        return [self.get_data_unit_path(unit) for unit in units]

    # Default behavior for all other datasets
    return [self.get_data_unit_path(unit) for unit in units]
validate_configuration()

Validate that the configuration is valid based on dataset availability constraints.

Specific rules:

- For age_structures:
    - School age data is only available for 2020 at 1km resolution.
    - Non-school age data is only available at 100m resolution.
    - Unconstrained, non-school age data is only available without UN adjustment.
    - Constrained, non-school age data with UN adjustment is only available for 2020.
    - Constrained, non-school age data without UN adjustment is only available for 2020 and 2024.
- For pop:
    - 2024 data is only available at 100m resolution and without UN adjustment.
    - Constrained data (other than 2024) is only available for 2020 at 100m resolution.
    - Unconstrained data at 100m or 1km is available for other years, with or without UN adjustment.

Source code in gigaspatial/handlers/worldpop.py
@model_validator(mode="after")
def validate_configuration(self):
    """
    Validate that the configuration is valid based on dataset availability constraints.

    Specific rules:
    - For age_structures:
        - School age data is only available for 2020 at 1km resolution.
        - Non-school age data is only available at 100m resolution.
        - Unconstrained, non-school age data is only available without UN adjustment.
        - Constrained, non-school age data with UN adjustment is only available for 2020.
        - Constrained, non-school age data without UN adjustment is only available for 2020 and 2024.
    - For pop:
        - 2024 data is only available at 100m resolution and without UN adjustment.
        - Constrained data (other than 2024) is only available for 2020 at 100m resolution.
        - Unconstrained data at 100m or 1km is available for other years, with or without UN adjustment.

    Side effects:
        May coerce `year`, `resolution`, `un_adjusted` and `constrained`
        (logging a warning each time) to the nearest available combination,
        and assigns `self.dataset_category` for every recognized project.

    Raises:
        ValueError: For combinations with no available dataset (constrained
            age structures outside 2020/2024, or school_age with project "pop").
    """

    if self.project == "age_structures":

        if self.school_age:
            # School-age data exists only as unconstrained, non-UN-adjusted
            # 2020 rasters at 1km; coerce every other setting with a warning.
            if self.resolution == 100:
                self.logger.warning(
                    "School age population datasets are only available at 1km `resolution`, resolution is set as 1000"
                )
                self.resolution = 1000

            if self.year != 2020:
                self.logger.warning(
                    "School age population datasets are only available for 2020, `year` is set as 2020"
                )
                self.year = 2020

            if self.un_adjusted:
                self.logger.warning(
                    "School age population datasets are only available without UN adjustment, `un_adjusted` is set as False"
                )
                self.un_adjusted = False

            if self.constrained:
                self.logger.warning(
                    "School age population datasets are only available unconstrained, `constrained` is set as False"
                )
                self.constrained = False

            self.dataset_category = "sapya1km"
        else:
            # Non-school-age age structures exist only at 100m.
            if self.resolution == 1000:
                self.logger.warning(
                    "Age structures datasets are only available at 100m resolution, `resolution` is set as 100"
                )
                self.resolution = 100

            if not self.constrained:
                if self.un_adjusted:
                    self.logger.warning(
                        "Age structures unconstrained datasets are only available without UN adjustment, `un_adjusted` is set as False"
                    )
                    self.un_adjusted = False

                self.dataset_category = (
                    "G2_UC_Age_2024_100m" if self.year == 2024 else "aswpgp"
                )
            else:
                if self.un_adjusted:
                    # Constrained + UN-adjusted exists only for 2020.
                    if self.year != 2020:
                        self.logger.warning(
                            "Age structures constrained datasets with UN adjustment are only available for 2020, `year` is set as 2020"
                        )
                        self.year = 2020
                    self.dataset_category = "ascicua_2020"
                else:
                    if self.year == 2024:
                        self.dataset_category = "G2_CN_Age_2024_100m"
                    elif self.year == 2020:
                        self.dataset_category = "ascic_2020"
                    else:
                        raise ValueError(
                            "Age structures constrained datasets without UN adjustment are only available for 2020 and 2024, please set `year` to one of the available options: 2020, 2024"
                        )

    elif self.project == "pop":

        if self.school_age:
            raise ValueError(
                f"""
                Received unexpected value of `{self.school_age}` for project: `{self.project}`.
                For school age population datasets, please set project as `age_structures`.
                """
            )

        if self.year == 2024:
            # 2024 population rasters exist only at 100m without UN adjustment.
            if self.resolution == 1000:
                self.logger.warning(
                    "2024 datasets are only available at 100m resolution, `resolution` is set as 100m"
                )
                self.resolution = 100
            if self.un_adjusted:
                self.logger.warning(
                    "2024 datasets are only available without UN adjustment, `un_adjusted` is set as False"
                )
                self.un_adjusted = False

            self.dataset_category = (
                "G2_CN_POP_2024_100m" if self.constrained else "G2_UC_POP_2024_100m"
            )
        else:
            if self.constrained:
                # Constrained pop rasters exist only for 2020 at 100m.
                if self.year != 2020:
                    self.logger.warning(
                        "Population constrained datasets are only available for 2020, `year` is set as 2020"
                    )
                    self.year = 2020

                if self.resolution != 100:
                    self.logger.warning(
                        "Population constrained datasets are only available at 100m resolution, `resolution` is set as 100"
                    )
                    self.resolution = 100

                self.dataset_category = (
                    "cic2020_UNadj_100m" if self.un_adjusted else "cic2020_100m"
                )
            else:
                if self.resolution == 100:
                    self.dataset_category = (
                        f"wpgp{'unadj' if self.un_adjusted else ''}"
                    )
                else:
                    self.dataset_category = (
                        "wpic1km" if not self.un_adjusted else "wpicuadj1km"
                    )

WPPopulationDownloader

Bases: BaseHandlerDownloader

Source code in gigaspatial/handlers/worldpop.py
class WPPopulationDownloader(BaseHandlerDownloader):
    """Downloads WorldPop population rasters.

    Handles both plain raster resources (.tif) and zipped bundles
    (e.g. school-age population datasets), extracting the contained
    .tif files from the latter. Failures are logged and reported as
    ``None`` rather than raised.
    """

    def __init__(
        self,
        config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        # Accept either a ready config object or raw parameters for one.
        config = (
            config
            if isinstance(config, WPPopulationConfig)
            else WPPopulationConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self, url: str, **kwargs
    ) -> Optional[Union[Path, List[Path]]]:
        """Download data file for a url. If a zip, extract contained .tif files.

        Returns:
            The downloaded file path for a plain resource, a list of
            extracted .tif paths for a zip resource, or None on failure
            (errors are logged, not raised).
        """
        # If the resource is a zip (e.g., school age datasets), download to temp and extract .tif files
        if url.lower().endswith(".zip"):
            temp_downloaded_path: Optional[Path] = None
            try:
                # The zip is staged on the local filesystem (delete=False so it
                # survives the `with` block); extraction targets the data store.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=".zip"
                ) as temp_file:
                    temp_downloaded_path = Path(temp_file.name)
                    response = self.config.client.session.get(
                        url, stream=True, timeout=self.config.client.timeout
                    )
                    response.raise_for_status()

                    # content-length may be absent; tqdm then shows an open-ended bar.
                    total_size = int(response.headers.get("content-length", 0))

                    with tqdm(
                        total=total_size,
                        unit="B",
                        unit_scale=True,
                        desc=f"Downloading {os.path.basename(temp_downloaded_path)}",
                    ) as pbar:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                temp_file.write(chunk)
                                pbar.update(len(chunk))

                extracted_files: List[Path] = []
                # Extracted .tif files land next to where the unit path would be.
                output_dir = self.config.get_data_unit_path(url).parent
                with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
                    members = [
                        m for m in zip_ref.namelist() if m.lower().endswith(".tif")
                    ]
                    for member in members:
                        # Flatten archive layout: keep only the member's base name.
                        extracted_path = output_dir / Path(member).name
                        with zip_ref.open(member) as source:
                            file_content = source.read()
                            self.data_store.write_file(
                                str(extracted_path), file_content
                            )
                        extracted_files.append(extracted_path)
                        self.logger.info(f"Extracted {member} to {extracted_path}")

                return extracted_files

            except requests.RequestException as e:
                self.logger.error(f"Failed to download {url}: {e}")
                return None
            except zipfile.BadZipFile:
                self.logger.error("Downloaded file is not a valid zip archive.")
                return None
            except Exception as e:
                self.logger.error(f"Unexpected error processing zip for {url}: {e}")
                return None
            finally:
                # Always remove the staged temp file, even on failure.
                if temp_downloaded_path and temp_downloaded_path.exists():
                    try:
                        temp_downloaded_path.unlink()
                    except OSError as e:
                        self.logger.warning(
                            f"Could not delete temporary file {temp_downloaded_path}: {e}"
                        )

        # Otherwise, download as a regular file (e.g., .tif)
        try:
            response = self.config.client.session.get(
                url, stream=True, timeout=self.config.client.timeout
            )
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))
            file_path = self.config.get_data_unit_path(url)

            # Stream straight into the data store in 8 KiB chunks.
            with self.data_store.open(str(file_path), "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {os.path.basename(file_path)}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.info(f"Successfully downloaded: {file_path}")
            return file_path

        except requests.RequestException as e:
            self.logger.error(f"Failed to download {url}: {e}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {e}")
            return None

    def download_data_units(
        self,
        urls: List[str],
        **kwargs,
    ) -> List[Path]:
        """Download data files for multiple urls.

        Runs `download_data_unit` across a process pool and flattens the
        results (zip resources yield lists); failed downloads (None) are
        dropped silently here since they were already logged.
        """

        with multiprocessing.Pool(self.config.n_workers) as pool:
            download_func = functools.partial(self.download_data_unit)
            results = list(
                tqdm(
                    pool.imap(download_func, urls),
                    total=len(urls),
                    desc=f"Downloading data",
                )
            )

        # Flatten results and filter out None
        flattened: List[Path] = []
        for item in results:
            if item is None:
                continue
            if isinstance(item, list):
                flattened.extend(item)
            else:
                flattened.append(item)

        return flattened
__init__(config, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Union[WPPopulationConfig, dict[str, Union[str, int]]]

Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters

required
data_store Optional[DataStore]

Optional data storage interface. If not provided, uses LocalDataStore.

None
logger Optional[Logger]

Optional custom logger. If not provided, uses default logger.

None
Source code in gigaspatial/handlers/worldpop.py
def __init__(
    self,
    config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: WorldPop dataset configuration, either a ready
            WPPopulationConfig or a dict of parameters used to build one.
        data_store: Optional data storage interface. Defaults to LocalDataStore.
        logger: Optional custom logger. Defaults to the standard logger.
    """
    # Normalize dict input into a WPPopulationConfig before delegating.
    if not isinstance(config, WPPopulationConfig):
        config = WPPopulationConfig(**config)
    super().__init__(config=config, data_store=data_store, logger=logger)
download_data_unit(url, **kwargs)

Download data file for a url. If a zip, extract contained .tif files.

Source code in gigaspatial/handlers/worldpop.py
def download_data_unit(
    self, url: str, **kwargs
) -> Optional[Union[Path, List[Path]]]:
    """Download data file for a url. If a zip, extract contained .tif files.

    Returns:
        The downloaded file path for a plain resource, a list of extracted
        .tif paths for a zip resource, or None on failure (errors are
        logged, not raised).
    """
    # If the resource is a zip (e.g., school age datasets), download to temp and extract .tif files
    if url.lower().endswith(".zip"):
        temp_downloaded_path: Optional[Path] = None
        try:
            # Stage the zip on the local filesystem (delete=False so it
            # survives the `with` block); extraction targets the data store.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=".zip"
            ) as temp_file:
                temp_downloaded_path = Path(temp_file.name)
                response = self.config.client.session.get(
                    url, stream=True, timeout=self.config.client.timeout
                )
                response.raise_for_status()

                # content-length may be absent; tqdm then shows an open-ended bar.
                total_size = int(response.headers.get("content-length", 0))

                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {os.path.basename(temp_downloaded_path)}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            extracted_files: List[Path] = []
            # Extracted .tif files land next to where the unit path would be.
            output_dir = self.config.get_data_unit_path(url).parent
            with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
                members = [
                    m for m in zip_ref.namelist() if m.lower().endswith(".tif")
                ]
                for member in members:
                    # Flatten archive layout: keep only the member's base name.
                    extracted_path = output_dir / Path(member).name
                    with zip_ref.open(member) as source:
                        file_content = source.read()
                        self.data_store.write_file(
                            str(extracted_path), file_content
                        )
                    extracted_files.append(extracted_path)
                    self.logger.info(f"Extracted {member} to {extracted_path}")

            return extracted_files

        except requests.RequestException as e:
            self.logger.error(f"Failed to download {url}: {e}")
            return None
        except zipfile.BadZipFile:
            self.logger.error("Downloaded file is not a valid zip archive.")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error processing zip for {url}: {e}")
            return None
        finally:
            # Always remove the staged temp file, even on failure.
            if temp_downloaded_path and temp_downloaded_path.exists():
                try:
                    temp_downloaded_path.unlink()
                except OSError as e:
                    self.logger.warning(
                        f"Could not delete temporary file {temp_downloaded_path}: {e}"
                    )

    # Otherwise, download as a regular file (e.g., .tif)
    try:
        response = self.config.client.session.get(
            url, stream=True, timeout=self.config.client.timeout
        )
        response.raise_for_status()

        total_size = int(response.headers.get("content-length", 0))
        file_path = self.config.get_data_unit_path(url)

        # Stream straight into the data store in 8 KiB chunks.
        with self.data_store.open(str(file_path), "wb") as file:
            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {os.path.basename(file_path)}",
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
                        pbar.update(len(chunk))

        self.logger.info(f"Successfully downloaded: {file_path}")
        return file_path

    except requests.RequestException as e:
        self.logger.error(f"Failed to download {url}: {e}")
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading {url}: {e}")
        return None
download_data_units(urls, **kwargs)

Download data files for multiple urls.

Source code in gigaspatial/handlers/worldpop.py
def download_data_units(
    self,
    urls: List[str],
    **kwargs,
) -> List[Path]:
    """Download data files for multiple urls in parallel.

    Args:
        urls: Resource URLs to download; zip resources may each expand
            into several extracted .tif files.
        **kwargs: Accepted for interface compatibility; unused.

    Returns:
        Flat list of paths to all downloaded/extracted files. Failed
        downloads return None from `download_data_unit` (already logged)
        and are omitted here.
    """
    # Bound methods are picklable, so they can be handed to worker
    # processes directly; the former no-argument functools.partial
    # wrapper added nothing.
    with multiprocessing.Pool(self.config.n_workers) as pool:
        results = list(
            tqdm(
                pool.imap(self.download_data_unit, urls),
                total=len(urls),
                desc="Downloading data",
            )
        )

    # Each result is a Path, a list of Paths (zip extraction), or None
    # (failure); flatten into a single list and drop failures.
    flattened: List[Path] = []
    for item in results:
        if item is None:
            continue
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)

    return flattened

WPPopulationHandler

Bases: BaseHandler

Handler for WorldPop Populations datasets.

This class provides a unified interface for downloading and loading WP Population data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/worldpop.py
class WPPopulationHandler(BaseHandler):
    """
    Handler for WorldPop Populations datasets.

    This class provides a unified interface for downloading and loading WP Population data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def __init__(
        self,
        project: Literal["pop", "age_structures"] = "pop",
        year: int = 2020,
        resolution: int = 1000,
        un_adjusted: bool = True,
        constrained: bool = False,
        school_age: bool = False,
        config: Optional[WPPopulationConfig] = None,
        downloader: Optional[WPPopulationDownloader] = None,
        reader: Optional[WPPopulationReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Initialize the handler.

        Args:
            project: WorldPop project, either "pop" or "age_structures".
            year: Dataset year.
            resolution: Raster resolution in meters.
            un_adjusted: Whether to use UN-adjusted estimates.
            constrained: Whether to use the constrained dataset variant.
            school_age: Whether to use school-age population datasets.
            config: Optional pre-built config; created via create_config() if None.
            downloader: Optional downloader; created via create_downloader() if None.
            reader: Optional reader; created via create_reader() if None.
            data_store: Optional data store; defaults to LocalDataStore.
            logger: Optional logger instance.
            **kwargs: Accepted for compatibility; not forwarded to super().__init__.
        """
        # Dataset parameters are stashed before super().__init__ runs,
        # because the base class may call create_config(), which reads them.
        self._project = project
        self._year = year
        self._resolution = resolution
        self._un_adjusted = un_adjusted
        self._constrained = constrained
        self._school_age = school_age
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> WPPopulationConfig:
        """
        Create and return a WPPopulationConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured WPPopulationConfig instance
        """
        return WPPopulationConfig(
            project=self._project,
            year=self._year,
            resolution=self._resolution,
            un_adjusted=self._un_adjusted,
            constrained=self._constrained,
            school_age=self._school_age,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: WPPopulationConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> WPPopulationDownloader:
        """
        Create and return a WPPopulationDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured WPPopulationDownloader instance
        """
        return WPPopulationDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: WPPopulationConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> WPPopulationReader:
        """
        Create and return a WPPopulationReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured WPPopulationReader instance
        """
        return WPPopulationReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_into_dataframe(
        self,
        source: str,
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load WorldPop population data into a pandas DataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            DataFrame containing the WorldPop population data
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        # load_data may return a single TifProcessor (e.g. merged rasters)
        # or a list of them; handle both shapes.
        if isinstance(tif_processors, TifProcessor):
            return tif_processors.to_dataframe(**kwargs)

        return pd.concat(
            [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )

    def load_into_geodataframe(
        self,
        source: str,
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load WorldPop population data into a geopandas GeoDataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing the WorldPop population data
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        # Single processor: convert directly; list: concatenate the parts.
        if isinstance(tif_processors, TifProcessor):
            return tif_processors.to_geodataframe(**kwargs)

        return pd.concat(
            [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
        )
create_config(data_store, logger, **kwargs)

Create and return a WPPopulationConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
WPPopulationConfig

Configured WPPopulationConfig instance

Source code in gigaspatial/handlers/worldpop.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> WPPopulationConfig:
    """
    Build the WPPopulationConfig for this handler.

    The dataset parameters captured at construction time are combined
    with the supplied data store and logger.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Extra keyword arguments forwarded to the config

    Returns:
        A configured WPPopulationConfig instance
    """
    return WPPopulationConfig(
        data_store=data_store,
        logger=logger,
        project=self._project,
        year=self._year,
        resolution=self._resolution,
        un_adjusted=self._un_adjusted,
        constrained=self._constrained,
        school_age=self._school_age,
        **kwargs,
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a WPPopulationDownloader instance.

Parameters:

Name Type Description Default
config WPPopulationConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
WPPopulationDownloader

Configured WPPopulationDownloader instance

Source code in gigaspatial/handlers/worldpop.py
def create_downloader(
    self,
    config: WPPopulationConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> WPPopulationDownloader:
    """
    Build the downloader component for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Extra keyword arguments forwarded to the downloader

    Returns:
        A configured WPPopulationDownloader instance
    """
    downloader = WPPopulationDownloader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
    return downloader
create_reader(config, data_store, logger, **kwargs)

Create and return a WPPopulationReader instance.

Parameters:

Name Type Description Default
config WPPopulationConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
WPPopulationReader

Configured WPPopulationReader instance

Source code in gigaspatial/handlers/worldpop.py
def create_reader(
    self,
    config: WPPopulationConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> WPPopulationReader:
    """
    Build the reader component for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Extra keyword arguments forwarded to the reader

    Returns:
        A configured WPPopulationReader instance
    """
    reader = WPPopulationReader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
    return reader
load_into_dataframe(source, ensure_available=True, **kwargs)

Load WorldPop population data into a pandas DataFrame.

Parameters:

Name Type Description Default
source str

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
DataFrame

DataFrame containing the WorldPop population data

Source code in gigaspatial/handlers/worldpop.py
def load_into_dataframe(
    self,
    source: str,
    ensure_available: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Load WorldPop population data into a pandas DataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        DataFrame containing the WorldPop population data
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    # load_data may return a single TifProcessor (e.g. merged rasters)
    # or a list of them; handle both shapes.
    if isinstance(tif_processors, TifProcessor):
        return tif_processors.to_dataframe(**kwargs)

    return pd.concat(
        [tp.to_dataframe(**kwargs) for tp in tif_processors], ignore_index=True
    )
load_into_geodataframe(source, ensure_available=True, **kwargs)

Load WorldPop population data into a geopandas GeoDataFrame.

Parameters:

Name Type Description Default
source str

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing the WorldPop population data

Source code in gigaspatial/handlers/worldpop.py
def load_into_geodataframe(
    self,
    source: str,
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load WorldPop population data into a geopandas GeoDataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing the WorldPop population data
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    # Single processor: convert directly; list: concatenate the parts.
    if isinstance(tif_processors, TifProcessor):
        return tif_processors.to_geodataframe(**kwargs)

    return pd.concat(
        [tp.to_geodataframe(**kwargs) for tp in tif_processors], ignore_index=True
    )

WPPopulationReader

Bases: BaseHandlerReader

Source code in gigaspatial/handlers/worldpop.py
class WPPopulationReader(BaseHandlerReader):
    """Reads downloaded WorldPop rasters into TifProcessor objects."""

    def __init__(
        self,
        config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the reader.

        Args:
            config: Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        # Accept either a ready config object or raw parameters for one.
        config = (
            config
            if isinstance(config, WPPopulationConfig)
            else WPPopulationConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self,
        source_data_path: List[Union[str, Path]],
        merge_rasters: bool = False,
        **kwargs,
    ) -> Union[List[TifProcessor], TifProcessor]:
        """
        Load TifProcessors of WP datasets.
        Args:
            source_data_path: List of file paths to load
            merge_rasters: If True, all rasters will be merged into a single TifProcessor.
                           Defaults to False.
        Returns:
            Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
                                                    TifProcessor if merge_rasters is True.
        """
        # Apply deferred age/sex filters if present and applicable
        # (filters only make sense for non-school-age age_structures data).
        if (
            hasattr(self.config, "_temp_age_sex_filters")
            and self.config.project == "age_structures"
            and not self.config.school_age
        ):
            # Ensure source_data_path is a list of Path objects for consistent filtering
            source_data_path = [
                Path(p) if isinstance(p, str) else p for p in source_data_path
            ]
            filtered_paths = self.config._filter_age_sex_paths(
                source_data_path, self.config._temp_age_sex_filters
            )
            # Clear the temporary filter after use
            # (it is one-shot state stashed on the config by the caller).
            del self.config._temp_age_sex_filters
            if not filtered_paths:
                self.logger.warning(
                    "No WorldPop age_structures paths matched the applied filters."
                )
                return []  # Return empty list if no paths after filtering
            source_data_path = filtered_paths

        return self._load_raster_data(
            raster_paths=source_data_path, merge_rasters=merge_rasters
        )

    def load(self, source, merge_rasters: bool = False, **kwargs):
        """Load WorldPop raster data from `source`; see BaseHandlerReader.load.

        Args:
            source: The data source specification.
            merge_rasters: If True, merge all rasters into a single TifProcessor.
            **kwargs: Additional parameters forwarded to the base loader.
        """
        return super().load(source=source, merge_rasters=merge_rasters, **kwargs)
__init__(config, data_store=None, logger=None)

Initialize the reader.

Parameters:

Name Type Description Default
config Union[WPPopulationConfig, dict[str, Union[str, int]]]

Configuration for the WorldPop dataset, either as a WPPopulationConfig object or a dictionary of parameters

required
data_store Optional[DataStore]

Optional data storage interface. If not provided, uses LocalDataStore.

None
logger Optional[Logger]

Optional custom logger. If not provided, uses default logger.

None
Source code in gigaspatial/handlers/worldpop.py
def __init__(
    self,
    config: Union[WPPopulationConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the reader.

    Args:
        config: WorldPop dataset configuration, either a ready
            WPPopulationConfig or a dict of parameters used to build one.
        data_store: Optional data storage interface. Defaults to LocalDataStore.
        logger: Optional custom logger. Defaults to the standard logger.
    """
    # Coerce raw parameter dicts into a proper config object.
    if isinstance(config, WPPopulationConfig):
        resolved_config = config
    else:
        resolved_config = WPPopulationConfig(**config)
    super().__init__(config=resolved_config, data_store=data_store, logger=logger)
load_from_paths(source_data_path, merge_rasters=False, **kwargs)

Load TifProcessors of WP datasets. Args: source_data_path: List of file paths to load merge_rasters: If True, all rasters will be merged into a single TifProcessor. Defaults to False. Returns: Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single TifProcessor if merge_rasters is True.

Source code in gigaspatial/handlers/worldpop.py
def load_from_paths(
    self,
    source_data_path: List[Union[str, Path]],
    merge_rasters: bool = False,
    **kwargs,
) -> Union[List[TifProcessor], TifProcessor]:
    """
    Load TifProcessors of WP datasets.
    Args:
        source_data_path: List of file paths to load
        merge_rasters: If True, all rasters will be merged into a single TifProcessor.
                       Defaults to False.
    Returns:
        Union[List[TifProcessor], TifProcessor]: List of TifProcessor objects for accessing the raster data or a single
                                                TifProcessor if merge_rasters is True.
    """
    # Apply deferred age/sex filters if present and applicable
    # (filters only make sense for non-school-age age_structures data).
    if (
        hasattr(self.config, "_temp_age_sex_filters")
        and self.config.project == "age_structures"
        and not self.config.school_age
    ):
        # Ensure source_data_path is a list of Path objects for consistent filtering
        source_data_path = [
            Path(p) if isinstance(p, str) else p for p in source_data_path
        ]
        filtered_paths = self.config._filter_age_sex_paths(
            source_data_path, self.config._temp_age_sex_filters
        )
        # Clear the temporary filter after use
        # (it is one-shot state stashed on the config by the caller).
        del self.config._temp_age_sex_filters
        if not filtered_paths:
            self.logger.warning(
                "No WorldPop age_structures paths matched the applied filters."
            )
            return []  # Return empty list if no paths after filtering
        source_data_path = filtered_paths

    return self._load_raster_data(
        raster_paths=source_data_path, merge_rasters=merge_rasters
    )

WorldPopRestClient

REST API client for WorldPop data access.

This class provides direct access to the WorldPop REST API without any configuration dependencies, allowing flexible integration patterns.

Source code in gigaspatial/handlers/worldpop.py
class WorldPopRestClient:
    """
    REST API client for WorldPop data access.

    This class provides direct access to the WorldPop REST API without any
    configuration dependencies, allowing flexible integration patterns.
    It can be used as a context manager to ensure the HTTP session is closed.
    """

    def __init__(
        self,
        base_url: str = "https://www.worldpop.org/rest/data",
        stats_url: str = "https://api.worldpop.org/v1/services/stats",
        api_key: Optional[str] = None,
        timeout: int = 30,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the WorldPop REST API client.

        Args:
            base_url: Base URL for the WorldPop REST API
            stats_url: URL for the WorldPop statistics API
            api_key: Optional API key for higher rate limits
            timeout: Request timeout in seconds
            logger: Optional logger instance
        """
        self.base_url = base_url.rstrip("/")
        self.stats_url = stats_url.rstrip("/")
        self.api_key = api_key
        self.timeout = timeout
        self.logger = logger or logging.getLogger(self.__class__.__name__)

        # Shared session: reuses connections and carries default headers
        # across all API calls made by this client.
        self.session = requests.Session()
        self.session.headers.update(
            {"Accept": "application/json", "User-Agent": "WorldPop-Python-Client/1.0"}
        )

        if self.api_key:
            self.session.headers["X-API-Key"] = self.api_key

    def get_available_projects(self) -> List[Dict[str, Any]]:
        """
        Get list of all available projects (e.g., population, births, pregnancies, etc.).

        Returns:
            List of project dictionaries with alias, name, title, and description.
            An empty list is returned (and the error logged) on request failure.
        """
        try:
            response = self.session.get(self.base_url, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            return data.get("data", [])
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch available project aliases: {e}")
            return []

    def get_project_sources(self, dataset_type: str) -> List[Dict[str, Any]]:
        """
        Get available sources for a specific project type.

        Args:
            dataset_type: Project type alias (e.g., 'pop', 'births', 'pregnancies')

        Returns:
            List of source dictionaries with alias and name.
            An empty list is returned (and the error logged) on request failure.
        """
        try:
            url = f"{self.base_url}/{dataset_type}"
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            return data.get("data", [])
        except requests.RequestException as e:
            self.logger.error(
                f"Failed to fetch project sources for {dataset_type}: {e}"
            )
            return []

    def get_source_entities(
        self, dataset_type: str, category: str
    ) -> List[Dict[str, Any]]:
        """
        Get list of entities (countries, global, continental) available for a specific project type and source.

        Args:
            dataset_type: Project type alias (e.g., 'pop', 'births')
            category: Source alias (e.g., 'wpgp', 'pic')

        Returns:
            List of entity dictionaries with id and iso3 codes (if applicable).
            An empty list is returned (and the error logged) on request failure.
        """
        try:
            url = f"{self.base_url}/{dataset_type}/{category}"
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            return data.get("data", [])
        except requests.RequestException as e:
            self.logger.error(
                f"Failed to fetch entities for {dataset_type}/{category}: {e}"
            )
            return []

    def get_datasets(
        self, dataset_type: str, category: str, params: dict
    ) -> List[Dict[str, Any]]:
        """
        Get all datasets available for the params.

        Args:
            dataset_type: Dataset type alias (e.g., 'pop', 'births')
            category: Category alias (e.g., 'wpgp', 'pic')
            params: Query parameters (e.g., {'iso3': 'RWA'})

        Returns:
            List of dataset dictionaries with metadata and file information.
            An empty list is returned (and the error logged) on request failure.
        """
        try:
            url = f"{self.base_url}/{dataset_type}/{category}"
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()
            return data.get("data", [])
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch datasets for {params}: {e}")
            return []

    def get_datasets_by_country(
        self, dataset_type: str, category: str, iso3: str
    ) -> List[Dict[str, Any]]:
        """
        Get all datasets available for a specific country.

        Args:
            dataset_type: Dataset type alias (e.g., 'pop', 'births')
            category: Category alias (e.g., 'wpgp', 'pic')
            iso3: ISO3 country code (e.g., 'USA', 'BRA')

        Returns:
            List of dataset dictionaries with metadata and file information
        """
        params = {"iso3": iso3}
        return self.get_datasets(dataset_type, category, params)

    def get_dataset_by_id(
        self, dataset_type: str, category: str, dataset_id: str
    ) -> Optional[Dict[str, Any]]:
        """
        Get dataset information by ID.

        Args:
            dataset_type: Dataset type alias (e.g., 'pop', 'births')
            category: Category alias (e.g., 'wpgp', 'pic')
            dataset_id: Dataset ID

        Returns:
            Dataset dictionary or None if not found
        """
        results = self.get_datasets(dataset_type, category, {"id": dataset_id})
        # get_datasets returns a list; unwrap the first (and expected only)
        # match to honor the Optional[Dict] contract declared above.
        return results[0] if results else None

    def find_dataset(
        self,
        dataset_type: str,
        category: str,
        iso3: str,
        year: Union[str, int],
        **filters,
    ) -> Optional[Dict[str, Any]]:
        """
        Find a specific dataset by year and optional filters.

        Args:
            dataset_type: Dataset type alias
            category: Category alias
            iso3: ISO3 country code
            year: Year to search for
            **filters: Additional filters (e.g., gender='F', resolution='1km')

        Returns:
            Dataset dictionary or None if not found
        """
        # NOTE: previously called the nonexistent get_country_datasets(),
        # which raised AttributeError at runtime.
        datasets = self.get_datasets_by_country(dataset_type, category, iso3)
        year_str = str(year)

        for dataset in datasets:
            if dataset.get("popyear") == year_str:
                # Check additional filters; a filter key absent from the
                # dataset record is not considered a mismatch.
                match = True
                for key, value in filters.items():
                    if key in dataset and dataset[key] != value:
                        match = False
                        break

                if match:
                    return dataset

        return None

    def list_years_for_country(
        self, dataset_type: str, category: str, iso3: str
    ) -> List[int]:
        """
        List all available years for a specific country and dataset.

        Args:
            dataset_type: Dataset type alias
            category: Category alias
            iso3: ISO3 country code

        Returns:
            Sorted list of available years
        """
        datasets = self.get_datasets_by_country(dataset_type, category, iso3)
        years = []

        for dataset in datasets:
            try:
                year = int(dataset.get("popyear", 0))
                if year > 0:
                    years.append(year)
            except (ValueError, TypeError):
                # Skip records whose popyear is missing or malformed.
                continue

        return sorted(years)

    def search_datasets(
        self,
        dataset_type: Optional[str] = None,
        category: Optional[str] = None,
        iso3: Optional[str] = None,
        year: Optional[Union[str, int]] = None,
        **filters,
    ) -> List[Dict[str, Any]]:
        """
        Search for datasets with flexible filtering.

        The result granularity depends on which arguments are supplied:
        no dataset_type -> all projects; no category -> sources for the
        project; no iso3 -> entities for the source; otherwise filtered
        country datasets.

        Args:
            dataset_type: Optional dataset type filter
            category: Optional category filter
            iso3: Optional country filter
            year: Optional year filter
            **filters: Additional filters

        Returns:
            List of matching datasets
        """
        results = []

        if dataset_type:
            if category:
                # If we have country-specific filters
                if iso3:
                    datasets = self.get_datasets_by_country(
                        dataset_type, category, iso3
                    )
                    for dataset in datasets:
                        match = True

                        # Check year filter
                        if year and dataset.get("popyear") != str(year):
                            match = False

                        # Check additional filters
                        for key, value in filters.items():
                            if key in dataset and dataset[key] != value:
                                match = False
                                break

                        if match:
                            results.append(dataset)
                else:
                    return self.get_source_entities(dataset_type, category)
            else:
                return self.get_project_sources(dataset_type)
        else:
            return self.get_available_projects()

        return results

    def get_dataset_info(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract useful information from a dataset dictionary.

        Args:
            dataset: Dataset dictionary from API

        Returns:
            Cleaned dataset information
        """
        return {
            "id": dataset.get("id"),
            "title": dataset.get("title"),
            "description": dataset.get("desc"),
            "doi": dataset.get("doi"),
            "citation": dataset.get("citation"),
            "data_format": dataset.get("data_format"),
            "year": dataset.get("popyear"),
            "country": dataset.get("country"),
            "iso3": dataset.get("iso3"),
            "continent": dataset.get("continent"),
            "download_urls": dataset.get("files", []),
            "image_url": dataset.get("url_img"),
            "summary_url": dataset.get("url_summary"),
            "license": dataset.get("license"),
            "organization": dataset.get("organisation"),
            "author": dataset.get("author_name"),
            "maintainer": dataset.get("maintainer_name"),
            "project": dataset.get("project"),
            "category": dataset.get("category"),
            "date_created": dataset.get("date"),
            "public": dataset.get("public") == "Y",
            "archived": dataset.get("archive") == "Y",
        }

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        """Context manager entry: return this client instance."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the session; exceptions propagate."""
        self.close()
__enter__()

Context manager entry.

Source code in gigaspatial/handlers/worldpop.py
def __enter__(self):
    """Context manager entry: return this client instance unchanged."""
    return self
__exit__(exc_type, exc_val, exc_tb)

Context manager exit.

Source code in gigaspatial/handlers/worldpop.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit: close the HTTP session; exceptions propagate (returns None)."""
    self.close()
__init__(base_url='https://www.worldpop.org/rest/data', stats_url='https://api.worldpop.org/v1/services/stats', api_key=None, timeout=30, logger=None)

Initialize the WorldPop REST API client.

Parameters:

Name Type Description Default
base_url str

Base URL for the WorldPop REST API

'https://www.worldpop.org/rest/data'
stats_url str

URL for the WorldPop statistics API

'https://api.worldpop.org/v1/services/stats'
api_key Optional[str]

Optional API key for higher rate limits

None
timeout int

Request timeout in seconds

30
logger Optional[Logger]

Optional logger instance

None
Source code in gigaspatial/handlers/worldpop.py
def __init__(
    self,
    base_url: str = "https://www.worldpop.org/rest/data",
    stats_url: str = "https://api.worldpop.org/v1/services/stats",
    api_key: Optional[str] = None,
    timeout: int = 30,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the WorldPop REST API client.

    Args:
        base_url: Base URL for the WorldPop REST API
        stats_url: URL for the WorldPop statistics API
        api_key: Optional API key for higher rate limits
        timeout: Request timeout in seconds
        logger: Optional logger instance; defaults to one named after the class
    """
    self.timeout = timeout
    self.api_key = api_key
    # Strip trailing slashes so endpoint paths can be appended safely.
    self.base_url = base_url.rstrip("/")
    self.stats_url = stats_url.rstrip("/")
    self.logger = logger if logger is not None else logging.getLogger(
        self.__class__.__name__
    )

    # One shared session carries the default headers for every request.
    session = requests.Session()
    session.headers.update(
        {"Accept": "application/json", "User-Agent": "WorldPop-Python-Client/1.0"}
    )
    if self.api_key:
        session.headers["X-API-Key"] = self.api_key
    self.session = session
close()

Close the session.

Source code in gigaspatial/handlers/worldpop.py
def close(self):
    """Close the underlying HTTP session."""
    self.session.close()
find_dataset(dataset_type, category, iso3, year, **filters)

Find a specific dataset by year and optional filters.

Parameters:

Name Type Description Default
dataset_type str

Dataset type alias

required
category str

Category alias

required
iso3 str

ISO3 country code

required
year Union[str, int]

Year to search for

required
**filters

Additional filters (e.g., gender='F', resolution='1km')

{}

Returns:

Type Description
Optional[Dict[str, Any]]

Dataset dictionary or None if not found

Source code in gigaspatial/handlers/worldpop.py
def find_dataset(
    self,
    dataset_type: str,
    category: str,
    iso3: str,
    year: Union[str, int],
    **filters,
) -> Optional[Dict[str, Any]]:
    """
    Find a specific dataset by year and optional filters.

    Args:
        dataset_type: Dataset type alias
        category: Category alias
        iso3: ISO3 country code
        year: Year to search for
        **filters: Additional filters (e.g., gender='F', resolution='1km')

    Returns:
        Dataset dictionary or None if not found
    """
    # Fix: previously called the nonexistent get_country_datasets(),
    # which raised AttributeError at runtime.
    datasets = self.get_datasets_by_country(dataset_type, category, iso3)
    year_str = str(year)

    for dataset in datasets:
        if dataset.get("popyear") == year_str:
            # Check additional filters; a filter key absent from the
            # dataset record is not treated as a mismatch.
            match = True
            for key, value in filters.items():
                if key in dataset and dataset[key] != value:
                    match = False
                    break

            if match:
                return dataset

    return None
get_available_projects()

Get list of all available projects (e.g., population, births, pregnancies, etc.).

Returns:

Type Description
List[Dict[str, Any]]

List of project dictionaries with alias, name, title, and description

Source code in gigaspatial/handlers/worldpop.py
def get_available_projects(self) -> List[Dict[str, Any]]:
    """
    Get list of all available projects (e.g., population, births, pregnancies, etc.).

    Returns:
        List of project dictionaries with alias, name, title, and description;
        an empty list (with the error logged) on request failure.
    """
    try:
        resp = self.session.get(self.base_url, timeout=self.timeout)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        self.logger.error(f"Failed to fetch available project aliases: {e}")
        return []
    return payload.get("data", [])
get_dataset_by_id(dataset_type, category, dataset_id)

Get dataset information by ID.

Parameters:

Name Type Description Default
dataset_type str

Dataset type alias (e.g., 'pop', 'births')

required
category str

Category alias (e.g., 'wpgp', 'pic')

required
dataset_id str

Dataset ID

required

Returns:

Type Description
Optional[Dict[str, Any]]

Dataset dictionary or None if not found

Source code in gigaspatial/handlers/worldpop.py
def get_dataset_by_id(
    self, dataset_type: str, category: str, dataset_id: str
) -> Optional[Dict[str, Any]]:
    """
    Get dataset information by ID.

    Args:
        dataset_type: Dataset type alias (e.g., 'pop', 'births')
        category: Category alias (e.g., 'wpgp', 'pic')
        dataset_id: Dataset ID

    Returns:
        Dataset dictionary or None if not found
    """
    results = self.get_datasets(dataset_type, category, {"id": dataset_id})
    # Fix: get_datasets returns a list, but this method's contract is
    # Optional[Dict]; unwrap the first (and expected only) match.
    return results[0] if results else None
get_dataset_info(dataset)

Extract useful information from a dataset dictionary.

Parameters:

Name Type Description Default
dataset Dict[str, Any]

Dataset dictionary from API

required

Returns:

Type Description
Dict[str, Any]

Cleaned dataset information

Source code in gigaspatial/handlers/worldpop.py
def get_dataset_info(self, dataset: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract useful information from a dataset dictionary.

    Missing keys map to None (or [] for download_urls); 'public' and
    'archived' become booleans from the API's 'Y'/'N' flags.

    Args:
        dataset: Dataset dictionary from API

    Returns:
        Cleaned dataset information
    """
    info: Dict[str, Any] = {}
    # First run of simple pass-through fields (output key -> API key).
    for out_key, api_key in (
        ("id", "id"),
        ("title", "title"),
        ("description", "desc"),
        ("doi", "doi"),
        ("citation", "citation"),
        ("data_format", "data_format"),
        ("year", "popyear"),
        ("country", "country"),
        ("iso3", "iso3"),
        ("continent", "continent"),
    ):
        info[out_key] = dataset.get(api_key)
    info["download_urls"] = dataset.get("files", [])
    # Remaining pass-through fields, in the documented output order.
    for out_key, api_key in (
        ("image_url", "url_img"),
        ("summary_url", "url_summary"),
        ("license", "license"),
        ("organization", "organisation"),
        ("author", "author_name"),
        ("maintainer", "maintainer_name"),
        ("project", "project"),
        ("category", "category"),
        ("date_created", "date"),
    ):
        info[out_key] = dataset.get(api_key)
    info["public"] = dataset.get("public") == "Y"
    info["archived"] = dataset.get("archive") == "Y"
    return info
get_datasets(dataset_type, category, params)

Get all datasets available for the params.

Parameters:

Name Type Description Default
dataset_type str

Dataset type alias (e.g., 'pop', 'births')

required
category str

Category alias (e.g., 'wpgp', 'pic')

required
params dict

Query parameters (e.g., {'iso3': 'RWA'})

required

Returns:

Type Description

List of dataset dictionaries with metadata and file information

Source code in gigaspatial/handlers/worldpop.py
def get_datasets(self, dataset_type: str, category: str, params: dict):
    """
    Get all datasets available for the given query parameters.

    Args:
        dataset_type: Dataset type alias (e.g., 'pop', 'births')
        category: Category alias (e.g., 'wpgp', 'pic')
        params: Query parameters (e.g., {'iso3': 'RWA'})

    Returns:
        List of dataset dictionaries with metadata and file information;
        an empty list (with the error logged) on request failure.
    """
    endpoint = f"{self.base_url}/{dataset_type}/{category}"
    try:
        resp = self.session.get(endpoint, params=params, timeout=self.timeout)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        self.logger.error(f"Failed to fetch datasets for {params}: {e}")
        return []
    return payload.get("data", [])
get_datasets_by_country(dataset_type, category, iso3)

Get all datasets available for a specific country.

Parameters:

Name Type Description Default
dataset_type str

Dataset type alias (e.g., 'pop', 'births')

required
category str

Category alias (e.g., 'wpgp', 'pic')

required
iso3 str

ISO3 country code (e.g., 'USA', 'BRA')

required

Returns:

Type Description
List[Dict[str, Any]]

List of dataset dictionaries with metadata and file information

Source code in gigaspatial/handlers/worldpop.py
def get_datasets_by_country(
    self, dataset_type: str, category: str, iso3: str
) -> List[Dict[str, Any]]:
    """
    Get all datasets available for a specific country.

    Thin convenience wrapper that queries get_datasets with an iso3 filter.

    Args:
        dataset_type: Dataset type alias (e.g., 'pop', 'births')
        category: Category alias (e.g., 'wpgp', 'pic')
        iso3: ISO3 country code (e.g., 'USA', 'BRA')

    Returns:
        List of dataset dictionaries with metadata and file information
    """
    return self.get_datasets(dataset_type, category, {"iso3": iso3})
get_project_sources(dataset_type)

Get available sources for a specific project type.

Parameters:

Name Type Description Default
dataset_type str

Project type alias (e.g., 'pop', 'births', 'pregnancies')

required

Returns:

Type Description
List[Dict[str, Any]]

List of source dictionaries with alias and name

Source code in gigaspatial/handlers/worldpop.py
def get_project_sources(self, dataset_type: str) -> List[Dict[str, Any]]:
    """
    Get available sources for a specific project type.

    Args:
        dataset_type: Project type alias (e.g., 'pop', 'births', 'pregnancies')

    Returns:
        List of source dictionaries with alias and name;
        an empty list (with the error logged) on request failure.
    """
    endpoint = f"{self.base_url}/{dataset_type}"
    try:
        resp = self.session.get(endpoint, timeout=self.timeout)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        self.logger.error(
            f"Failed to fetch project sources for {dataset_type}: {e}"
        )
        return []
    return payload.get("data", [])
get_source_entities(dataset_type, category)

Get list of entities (countries, global, continental) available for a specific project type and source.

Parameters:

Name Type Description Default
dataset_type str

Project type alias (e.g., 'pop', 'births')

required
category str

Source alias (e.g., 'wpgp', 'pic')

required

Returns:

Type Description
List[Dict[str, Any]]

List of entity dictionaries with id and iso3 codes (if applicable)

Source code in gigaspatial/handlers/worldpop.py
def get_source_entities(
    self, dataset_type: str, category: str
) -> List[Dict[str, Any]]:
    """
    Get list of entities (countries, global, continental) available for a specific project type and source.

    Args:
        dataset_type: Project type alias (e.g., 'pop', 'births')
        category: Source alias (e.g., 'wpgp', 'pic')

    Returns:
        List of entity dictionaries with id and iso3 codes (if applicable);
        an empty list (with the error logged) on request failure.
    """
    endpoint = f"{self.base_url}/{dataset_type}/{category}"
    try:
        resp = self.session.get(endpoint, timeout=self.timeout)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        self.logger.error(
            f"Failed to fetch entities for {dataset_type}/{category}: {e}"
        )
        return []
    return payload.get("data", [])
list_years_for_country(dataset_type, category, iso3)

List all available years for a specific country and dataset.

Parameters:

Name Type Description Default
dataset_type str

Dataset type alias

required
category str

Category alias

required
iso3 str

ISO3 country code

required

Returns:

Type Description
List[int]

Sorted list of available years

Source code in gigaspatial/handlers/worldpop.py
def list_years_for_country(
    self, dataset_type: str, category: str, iso3: str
) -> List[int]:
    """
    List all available years for a specific country and dataset.

    Args:
        dataset_type: Dataset type alias
        category: Category alias
        iso3: ISO3 country code

    Returns:
        Sorted list of available years
    """
    years: List[int] = []
    for entry in self.get_datasets_by_country(dataset_type, category, iso3):
        raw_year = entry.get("popyear", 0)
        try:
            parsed = int(raw_year)
        except (ValueError, TypeError):
            # Skip records whose popyear is missing or malformed.
            continue
        if parsed > 0:
            years.append(parsed)
    return sorted(years)
search_datasets(dataset_type=None, category=None, iso3=None, year=None, **filters)

Search for datasets with flexible filtering.

Parameters:

Name Type Description Default
dataset_type Optional[str]

Optional dataset type filter

None
category Optional[str]

Optional category filter

None
iso3 Optional[str]

Optional country filter

None
year Optional[Union[str, int]]

Optional year filter

None
**filters

Additional filters

{}

Returns:

Type Description
List[Dict[str, Any]]

List of matching datasets

Source code in gigaspatial/handlers/worldpop.py
def search_datasets(
    self,
    dataset_type: Optional[str] = None,
    category: Optional[str] = None,
    iso3: Optional[str] = None,
    year: Optional[Union[str, int]] = None,
    **filters,
) -> List[Dict[str, Any]]:
    """
    Search for datasets with flexible filtering.

    The result granularity depends on which arguments are given:
    no dataset_type -> all projects; no category -> project sources;
    no iso3 -> source entities; otherwise filtered country datasets.

    Args:
        dataset_type: Optional dataset type filter
        category: Optional category filter
        iso3: Optional country filter
        year: Optional year filter
        **filters: Additional filters

    Returns:
        List of matching datasets
    """
    # Guard clauses: fall back to the broadest listing that the supplied
    # arguments allow.
    if not dataset_type:
        return self.get_available_projects()
    if not category:
        return self.get_project_sources(dataset_type)
    if not iso3:
        return self.get_source_entities(dataset_type, category)

    matches: List[Dict[str, Any]] = []
    for candidate in self.get_datasets_by_country(dataset_type, category, iso3):
        keep = True

        # Year filter (API stores the year as a string).
        if year and candidate.get("popyear") != str(year):
            keep = False

        # Additional filters: only keys present in the record can mismatch.
        for key, expected in filters.items():
            if key in candidate and candidate[key] != expected:
                keep = False
                break

        if keep:
            matches.append(candidate)

    return matches